diff --git a/notebooks/Name suggestion tensor2tensor.ipynb b/notebooks/Name suggestion tensor2tensor.ipynb new file mode 100644 index 0000000..c56369d --- /dev/null +++ b/notebooks/Name suggestion tensor2tensor.ipynb @@ -0,0 +1,5284 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "colab": { + "name": "Name suggestions - tensor2tensor.ipynb", + "provenance": [], + "collapsed_sections": [ + "3AM6HYxO8QIX", + "C6DS7t_E8QJF", + "t06MnGcJeN4V", + "bs94qZOdSq-v", + "3RYV2w1RS1MC" + ] + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-25T20:53:35.718194Z", + "start_time": "2019-10-25T20:53:35.708956Z" + }, + "id": "NDnsvpoS8QIB", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!pip3 install --no-cache-dir --upgrade --force-reinstall tensorflow==1.14 #tensorflow-gpu\n", + "!pip3 install --no-cache-dir --upgrade --force-reinstall tensor2tensor" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-26T21:07:50.506864Z", + "start_time": "2019-10-26T21:07:44.537647Z" + }, + "id": "D3EzHaXT8QIK", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#@title Run this only once - Sets up TF Eager execution\n", + "import tensorflow as tf\n", + "\n", + "# t2t supports only TF 1.14 ATM\n", + "print(tf.__version__)\n", + "\n", + "# Enable Eager execution - useful for seeing the generated data.\n", + "tf.enable_eager_execution()\n", + "\n", + "# Set a seed so that we have deterministic outputs.\n", + "from tensor2tensor.utils import trainer_lib\n", + "RANDOM_SEED = 
301\n", + "trainer_lib.set_random_seed(RANDOM_SEED)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bix745NV8QIQ", + "colab_type": "text" + }, + "source": [ + "# Function name suggestion\n", + "\n", + "Given the _function body_ text, suggest a _function name_.\n", + "\n", + "Use the existing multilingual dataset of ~3M functions from GitHub's CodeSearchNet (500k Java).\n", + "\n", + "Q: What is the best way to compare results:\n", + " - with identifier-based predictions?\n", + " - with the literature?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-26T13:06:32.957984Z", + "start_time": "2019-10-26T13:06:32.930510Z" + }, + "id": "3AM6HYxO8QIX", + "colab_type": "text" + }, + "source": [ + "## Data: exploration" + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-26T21:36:03.517164Z", + "start_time": "2019-10-26T21:36:03.456810Z" + }, + "id": "IIkpuq9e8QIZ", + "colab_type": "code", + "outputId": "05cbe251-d754-4fd8-a22c-3fcf9d934897", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from pathlib import Path\n", + "\n", + "java_files = sorted(Path('/devfest/repos/name-suggestion/tmp/java/').glob('**/*.gz'))\n", + "print(f'Total number of files: {len(java_files):,}')\n", + "\n", + "columns_long_list = ['repo', 'path', 'url', 'code', \n", + " 'code_tokens', 'func_name', \n", + " 'language', 'partition']\n", + "\n", + "def jsonl_list_to_dataframe(file_list, columns=columns_long_list):\n", + " \"\"\"Load a list of jsonl.gz files into a pandas DataFrame.\"\"\"\n", + " return pd.concat([pd.read_json(f, \n", + " orient='records', \n", + " compression='gzip',\n", + " lines=True)[columns] \n", + " for f in file_list], sort=False)\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Total number of files: 18\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": 
"code", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-26T21:36:57.588322Z", + "start_time": "2019-10-26T21:36:08.461218Z" + }, + "id": "pGZ8nw638QId", + "colab_type": "code", + "colab": {} + }, + "source": [ + "jdf = jsonl_list_to_dataframe(java_files)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-26T21:36:57.649841Z", + "start_time": "2019-10-26T21:36:57.592537Z" + }, + "id": "NbMraR-08QIi", + "colab_type": "code", + "outputId": "03e3e75f-c232-4019-e66b-d9e2a58d115a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 318 + } + }, + "source": [ + "jdf.head(3)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + " | repo | \n", + "path | \n", + "url | \n", + "code | \n", + "code_tokens | \n", + "func_name | \n", + "language | \n", + "partition | \n", + "
---|---|---|---|---|---|---|---|---|
0 | \n", + "ReactiveX/RxJava | \n", + "src/main/java/io/reactivex/internal/observers/... | \n", + "https://github.com/ReactiveX/RxJava/blob/ac841... | \n", + "protected final void fastPathOrderedEmit(U val... | \n", + "[protected, final, void, fastPathOrderedEmit, ... | \n", + "QueueDrainObserver.fastPathOrderedEmit | \n", + "java | \n", + "test | \n", + "
1 | \n", + "ReactiveX/RxJava | \n", + "src/main/java/io/reactivex/Observable.java | \n", + "https://github.com/ReactiveX/RxJava/blob/ac841... | \n", + "@CheckReturnValue\\n @NonNull\\n @Schedule... | \n", + "[@, CheckReturnValue, @, NonNull, @, Scheduler... | \n", + "Observable.amb | \n", + "java | \n", + "test | \n", + "
2 | \n", + "ReactiveX/RxJava | \n", + "src/main/java/io/reactivex/Observable.java | \n", + "https://github.com/ReactiveX/RxJava/blob/ac841... | \n", + "@SuppressWarnings(\"unchecked\")\\n @CheckRetu... | \n", + "[@, SuppressWarnings, (, \"unchecked\", ), @, Ch... | \n", + "Observable.ambArray | \n", + "java | \n", + "test | \n", + "
\n", + " | func_name_len | \n", + "
---|---|
0.50 | \n", + "29.0 | \n", + "
0.70 | \n", + "35.0 | \n", + "
0.80 | \n", + "39.0 | \n", + "
0.90 | \n", + "46.0 | \n", + "
0.95 | \n", + "51.0 | \n", + "
1.00 | \n", + "140.0 | \n", + "
\n", + " | code_len | \n", + "
---|---|
0.50 | \n", + "66.0 | \n", + "
0.70 | \n", + "104.0 | \n", + "
0.80 | \n", + "142.0 | \n", + "
0.90 | \n", + "224.0 | \n", + "
0.95 | \n", + "331.0 | \n", + "
1.00 | \n", + "68278.0 | \n", + "
\n", + " | code | \n", + "code_tokens | \n", + "
---|---|---|
23235 | \n", + "@Given(\"ich habe eine Aktion mit dem Symbol $sümbol und eine Schwelle von $threshold\")\\n public void aStock(@Named(\"sümbol\") String symbol, @Named(\"threshold\") double threshold) {\\n stock = new Stock(symbol, threshold);\\n } | \n", + "[@, Given, (, \"ich habe eine Aktion mit dem Symbol $sümbol und eine Schwelle von $threshold\"), , public, void, aStock, (, @, Named, (, \"sümbol\"), , tring , ymbol,, , N, amed(, \", threshold\"), , ouble , hreshold), , , stock, =, new, Stock, (, symbol, ,, threshold, ), ;, }] | \n", + "
\n", + " | repo | \n", + "path | \n", + "url | \n", + "code | \n", + "code_tokens | \n", + "func_name | \n", + "language | \n", + "partition | \n", + "
---|---|---|---|---|---|---|---|---|
9186 | \n", + "networknt/light-4j | \n", + "config/src/main/java/com/networknt/config/ConfigInjection.java | \n", + "https://github.com/networknt/light-4j/blob/2a60257c60663684c8f6dc8b5ea3cf184e534db6/config/src/main/java/com/networknt/config/ConfigInjection.java#L79-L84 | \n", + "public static boolean isExclusionConfigFile(String configName) {\\n List<Object> exclusionConfigFileList = (exclusionMap == null) ? new ArrayList<>() : (List<Object>) exclusionMap.get(EXCLUSION_CONFIG_FILE_LIST);\\n return CENTRALIZED_MANAGEMENT.equals(configName)\\n || SCALABLE_CONFIG.equals(configName)\\n || exclusionConfigFileList.contains(configName);\\n } | \n", + "[public, static, boolean, isExclusionConfigFile, (, String, configName, ), {, List, <, Object, >, exclusionConfigFileList, =, (, exclusionMap, ==, null, ), ?, new, ArrayList, <>, (, ), :, (, List, <, Object, >, ), exclusionMap, ., get, (, EXCLUSION_CONFIG_FILE_LIST, ), ;, return, CENTRALIZED_MANAGEMENT, ., equals, (, configName, ), ||, SCALABLE_CONFIG, ., equals, (, configName, ), ||, exclusionConfigFileList, ., contains, (, configName, ), ;, }] | \n", + "ConfigInjection.isExclusionConfigFile | \n", + "java | \n", + "train | \n", + "
26452 | \n", + "icode/ameba | \n", + "src/main/java/ameba/feature/datasource/WebStatFilter.java | \n", + "https://github.com/icode/ameba/blob/9d4956e935898e41331b2745e400ef869cd265e0/src/main/java/ameba/feature/datasource/WebStatFilter.java#L314-L333 | \n", + "public boolean isExclusion(String requestURI) {\\n if (excludesPattern == null) {\\n return false;\\n }\\n\\n if (contextPath != null && requestURI.startsWith(contextPath)) {\\n requestURI = requestURI.substring(contextPath.length());\\n if (!requestURI.startsWith(\"/\")) {\\n requestURI = \"/\" + requestURI;\\n }\\n }\\n\\n for (String pattern : excludesPattern) {\\n if (pathMatcher.matches(pattern, requestURI)) {\\n return true;\\n }\\n }\\n\\n return false;\\n } | \n", + "[public, boolean, isExclusion, (, String, requestURI, ), {, if, (, excludesPattern, ==, null, ), {, return, false, ;, }, if, (, contextPath, !=, null, &&, requestURI, ., startsWith, (, contextPath, ), ), {, requestURI, =, requestURI, ., substring, (, contextPath, ., length, (, ), ), ;, if, (, !, requestURI, ., startsWith, (, \"/\", ), ), {, requestURI, =, \"/\", +, requestURI, ;, }, }, for, (, String, pattern, :, excludesPattern, ), {, if, (, pathMatcher, ., matches, (, pattern, ,, requestURI, ), ), {, return, true, ;, }, }, return, false, ;, }] | \n", + "WebStatFilter.isExclusion | \n", + "java | \n", + "train | \n", + "
3025 | \n", + "nutzam/nutz | \n", + "src/org/nutz/mvc/NutFilter.java | \n", + "https://github.com/nutzam/nutz/blob/a38694d5cbda2692e7931ab093c168487a6a4bfe/src/org/nutz/mvc/NutFilter.java#L151-L169 | \n", + "protected boolean isExclusion(String matchUrl) throws IOException, ServletException {\\n \\tif (ignorePtn != null && ignorePtn.matcher(matchUrl).find()) {\\n \\t\\treturn true;\\n \\t}\\n \\tif (exclusionsSuffix != null) {\\n \\t\\tif (exclusionsSuffix.matcher(matchUrl).find()) {\\n\\t \\t\\treturn true;\\n \\t\\t}\\n \\t}\\n \\tif (exclusionsPrefix != null) {\\n \\t\\tif (exclusionsPrefix.matcher(matchUrl).find()) {\\n\\t \\t\\treturn true;\\n \\t\\t}\\n \\t}\\n \\tif (exclusionPaths != null && exclusionPaths.contains(matchUrl)) {\\n \\t\\treturn true;\\n \\t}\\n \\treturn false;\\n } | \n", + "[protected, boolean, isExclusion, (, String, matchUrl, ), throws, IOException, ,, ServletException, {, if, (, ignorePtn, !=, null, &&, ignorePtn, ., matcher, (, matchUrl, ), ., find, (, ), ), {, return, true, ;, }, if, (, exclusionsSuffix, !=, null, ), {, if, (, exclusionsSuffix, ., matcher, (, matchUrl, ), ., find, (, ), ), {, return, true, ;, }, }, if, (, exclusionsPrefix, !=, null, ), {, if, (, exclusionsPrefix, ., matcher, (, matchUrl, ), ., find, (, ), ), {, return, true, ;, }, }, if, (, exclusionPaths, !=, null, &&, exclusionPaths, ., contains, (, matchUrl, ), ), {, ...] | \n", + "NutFilter.isExclusion | \n", + "java | \n", + "valid | \n", + "