From b102a71b35a81990d266dadb05d49aa43fe1bb5a Mon Sep 17 00:00:00 2001 From: Jakub Wieczorek Date: Sun, 2 Oct 2022 16:24:26 +0200 Subject: [PATCH] Extract out inline scripts into separate files. --- .github/workflows/issue_created.yml | 180 +--------------------------- scripts/extract_captions.js | 98 +++++++++++++++ scripts/save_metadata.js | 24 ++++ scripts/transcribe_with_whisper.js | 61 ++++++++++ 4 files changed, 189 insertions(+), 174 deletions(-) create mode 100644 scripts/extract_captions.js create mode 100644 scripts/save_metadata.js create mode 100644 scripts/transcribe_with_whisper.js diff --git a/.github/workflows/issue_created.yml b/.github/workflows/issue_created.yml index 53b9b1c..6d25be9 100644 --- a/.github/workflows/issue_created.yml +++ b/.github/workflows/issue_created.yml @@ -20,128 +20,16 @@ jobs: name: Save metadata with: script: | - // yt-dlp 'https://www.youtube.com/watch?v=...' --skip-download --write-info-json - const child_process = require('child_process'); - const fs = require('fs'); - const body = context.payload.issue.body; - let match = body.match(/https?:\/\/[^\s]+/); - // Ignore github.com URLs - if (match && match[0].includes('https://github.com')) { - match = null; - } - if (!match) { - return; - } - const url = match[0]; - const number = context.issue.number.toString(); - child_process.spawnSync('mkdir', ['-p', `${number}/metadata-temp`]); - child_process.spawnSync('mkdir', ['-p', `${number}/metadata`]); - child_process.spawnSync('yt-dlp', [url, '--skip-download', '--write-info-json'], { - cwd: `${number}/metadata-temp` - }); - // Rename to info.json - child_process.exec(`cat ${number}/metadata-temp/*.json | jq 'del(.automatic_captions)' -c > ${number}/metadata/info.json`); - child_process.exec(`rm ${number}/metadata-temp/*.json`); + const script = require('./scripts/save_metadata.js') + script(context) - uses: actions/github-script@v6 name: Extract captions # Only run this step if the issue is labeled with "captions" if: 
contains(github.event.issue.labels.*.name, 'captions') with: script: | - const child_process = require('child_process'); - const fs = require('fs'); - const body = context.payload.issue.body; - let match = body.match(/https?:\/\/[^\s]+/); - // Ignore github.com URLs - if (match && match[0].includes('https://github.com')) { - match = null; - } - if (!match) { - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: '🚫 No URL found in issue body.' - }); - return; - } - const url = match[0]; - // Create directory named after the issue number - const number = context.issue.number.toString(); - child_process.spawnSync('mkdir', ['-p', number]); - // Create subdirectories subs, auto, whisper - child_process.spawnSync('mkdir', ['-p', `${number}/subs`]); - child_process.spawnSync('mkdir', ['-p', `${number}/auto`]); - let results = []; - - function run(command, args, cwd) { - // Returns [status stdout, stderr] - const options = { - cwd: cwd, - encoding: 'utf-8', - }; - const result = child_process.spawnSync(command, args, options); - return [result.status, result.stdout, result.stderr]; - } - results.push(run('yt-dlp', ['--all-subs', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/subs`)); - // Did that create any files? 
- const files = child_process.spawnSync('ls', [`${number}/subs`]); - if (files.stdout.toString().trim() === '') { - // Use --write-auto-sub - results.push(run('yt-dlp', ['--write-auto-sub', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/auto`)); - // Now delete all but the `ru` and `en` files - const autoFiles = fs.readdirSync(`${number}/auto`); - autoFiles.forEach(file => { - if (!file.includes('.ru.') && !file.includes('.en.')) { - fs.unlinkSync(`${number}/auto/${file}`); - } - }); - } - const comment = results.map(result => { - return `\`\`\` + ${result[0]}\n${result[1]}\n${result[2]}\`\`\``; - }).join('\n'); - // Post the output as a comment - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: comment, - }); - // Generate a comment with the result from .ttml or .vtt files - for (const format of ['ttml', 'vtt']) { - const globber = await glob.create(`${number}/**/*.${format}`); - const foundFiles = await globber.glob(); - if (foundFiles.length) { - // Pass it through XXXX-to-json - let args = [foundFiles[0]]; - let command = null; - if (format === 'vtt') { - args.push('--dedupe'); - command = 'webvtt-to-json'; - } else { - command = 'ttml-to-json'; - } - const captions = JSON.parse( - child_process.spawnSync(command, args, { - encoding: 'utf-8', - } - ).stdout); - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: '```\n' + captions.map(caption => caption.lines.join('\n')).join('\n') + '\n```', - }); - break; - } - } - // Close the issue - github.rest.issues.update({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - state: 'closed' - }); + const script = require('./scripts/extract_captions.js') + await script({ context, github, glob }) - uses: actions/github-script@v6 name: Run it through whisper # Only run this step if the 
issue is labeled with "whisper" @@ -150,64 +38,8 @@ jobs: REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }} with: script: | - const child_process = require('child_process'); - const fs = require('fs'); - const body = context.payload.issue.body; - let match = body.match(/https?:\/\/[^\s]+/); - // Ignore github.com URLs - if (match && match[0].includes('https://github.com')) { - match = null; - } - if (!match) { - return; - } - const url = match[0]; - // Create directory named after the issue number - const number = context.issue.number.toString(); - child_process.spawnSync('mkdir', ['-p', `${number}/whisper`]); - // Use yt-dlp to extract the audio - console.log(child_process.spawnSync('yt-dlp', [ - '-x', url, '--audio-format', 'mp3', '--output', `${number}/whisper/audio.%(ext)s` - ], { - encoding: 'utf-8', - }).stdout); - console.log(child_process.spawnSync('find', [number], { - encoding: 'utf-8', - }).stdout); - // Run it through whisper - console.log(child_process.spawnSync('python', [ - 'transcribe_audio.py', `${number}/whisper/audio.mp3`, `${number}/whisper/transcription.json` - ], { - encoding: 'utf-8', - }).stderr); - // Now delete the audio file so we don't check it into the repo - fs.unlinkSync(`${number}/whisper/audio.mp3`); - // Load JSON from transcription.json - const transcription = JSON.parse(fs.readFileSync(`${number}/whisper/transcription.json`)); - let comment = ''; - if (transcription.detected_language) { - comment += `Language: ${transcription.detected_language}\n\n`; - } - if (transcription.transcription) { - comment += 'Transcription: `' + transcription.transcription + '`\n\n'; - } - if (transcription.translation) { - comment += 'Translation: `' + transcription.translation + '`\n\n'; - } - // Post the output as a comment - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: comment - }); - // Close the issue - github.rest.issues.update({ - issue_number: 
context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - state: 'closed' - }); + const script = require('./scripts/transcribe_with_whisper.js') + script({ context, github }) - name: Commit and push run: |- git config user.name "Automated" diff --git a/scripts/extract_captions.js b/scripts/extract_captions.js new file mode 100644 index 0000000..282ac88 --- /dev/null +++ b/scripts/extract_captions.js @@ -0,0 +1,98 @@ +const child_process = require('child_process'); +const fs = require('fs'); + +function run(command, args, cwd) { + // Runs command synchronously in cwd and returns [status, stdout, stderr] + const options = { + cwd: cwd, + encoding: 'utf-8', + }; + const result = child_process.spawnSync(command, args, options); + return [result.status, result.stdout, result.stderr]; +} +// Downloads subtitles with yt-dlp for the first URL in the issue body, posts the output and the caption text as issue comments, then closes the issue. +module.exports = async ({ context, github, glob }) => { + const body = context.payload.issue.body; + let match = body.match(/https?:\/\/[^\s]+/); + // Ignore github.com URLs + if (match && match[0].includes('https://github.com')) { + match = null; + } + if (!match) { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '🚫 No URL found in issue body.' + }); + return; + } + const url = match[0]; + // Create directory named after the issue number + const number = context.issue.number.toString(); + child_process.spawnSync('mkdir', ['-p', number]); + // Create subdirectories subs and auto + child_process.spawnSync('mkdir', ['-p', `${number}/subs`]); + child_process.spawnSync('mkdir', ['-p', `${number}/auto`]); + let results = []; + + results.push(run('yt-dlp', ['--all-subs', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/subs`)); + // Did that create any files? 
+ const files = child_process.spawnSync('ls', [`${number}/subs`]); + if (files.stdout.toString().trim() === '') { + // Use --write-auto-sub + results.push(run('yt-dlp', ['--write-auto-sub', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/auto`)); + // Now delete all but the `ru` and `en` files + const autoFiles = fs.readdirSync(`${number}/auto`); + autoFiles.forEach(file => { + if (!file.includes('.ru.') && !file.includes('.en.')) { + fs.unlinkSync(`${number}/auto/${file}`); + } + }); + } + const comment = results.map(result => { + return `\`\`\` + ${result[0]}\n${result[1]}\n${result[2]}\`\`\``; + }).join('\n'); + // Post the yt-dlp output as a comment; awaited so API failures surface and it lands before the captions comment + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: comment, + }); + // Generate a comment with the result from .ttml or .vtt files + for (const format of ['ttml', 'vtt']) { + const globber = await glob.create(`${number}/**/*.${format}`); + const foundFiles = await globber.glob(); + if (foundFiles.length) { + // Pass it through XXXX-to-json + let args = [foundFiles[0]]; + let command = null; + if (format === 'vtt') { + args.push('--dedupe'); + command = 'webvtt-to-json'; + } else { + command = 'ttml-to-json'; + } + const captions = JSON.parse( + child_process.spawnSync(command, args, { + encoding: 'utf-8', + } + ).stdout); + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '```\n' + captions.map(caption => caption.lines.join('\n')).join('\n') + '\n```', + }); + break; + } + } + // Close the issue once every comment has been posted + await github.rest.issues.update({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed' + }); +} diff --git a/scripts/save_metadata.js b/scripts/save_metadata.js new file mode 100644 index 0000000..f075ea1 --- /dev/null +++ b/scripts/save_metadata.js @@ -0,0 +1,24 @@ +const 
child_process = require('child_process'); +// Writes the yt-dlp info JSON (without automatic_captions) for the first URL in the issue body to <issue number>/metadata/info.json. +module.exports = (context) => { + // yt-dlp 'https://www.youtube.com/watch?v=...' --skip-download --write-info-json + const body = context.payload.issue.body; + let match = body.match(/https?:\/\/[^\s]+/); + // Ignore github.com URLs + if (match && match[0].includes('https://github.com')) { + match = null; + } + if (!match) { + return; + } + const url = match[0]; + const number = context.issue.number.toString(); + child_process.spawnSync('mkdir', ['-p', `${number}/metadata-temp`]); + child_process.spawnSync('mkdir', ['-p', `${number}/metadata`]); + child_process.spawnSync('yt-dlp', [url, '--skip-download', '--write-info-json'], { + cwd: `${number}/metadata-temp` + }); + // Write info.json, then remove the temp files; both run synchronously through a shell so the rm cannot race the read + child_process.spawnSync(`cat ${number}/metadata-temp/*.json | jq 'del(.automatic_captions)' -c > ${number}/metadata/info.json`, { shell: true }); + child_process.spawnSync(`rm ${number}/metadata-temp/*.json`, { shell: true }); +} diff --git a/scripts/transcribe_with_whisper.js b/scripts/transcribe_with_whisper.js new file mode 100644 index 0000000..822e854 --- /dev/null +++ b/scripts/transcribe_with_whisper.js @@ -0,0 +1,61 @@ +const child_process = require('child_process'); +const fs = require('fs'); +// Extracts the audio with yt-dlp, runs transcribe_audio.py on it, posts the result as an issue comment and closes the issue. +module.exports = ({ context, github }) => { + const body = context.payload.issue.body; + let match = body.match(/https?:\/\/[^\s]+/); + // Ignore github.com URLs + if (match && match[0].includes('https://github.com')) { + match = null; + } + if (!match) { + return; + } + const url = match[0]; + // Create directory named after the issue number + const number = context.issue.number.toString(); + child_process.spawnSync('mkdir', ['-p', `${number}/whisper`]); + // Use yt-dlp to extract the audio + console.log(child_process.spawnSync('yt-dlp', [ + '-x', url, '--audio-format', 'mp3', '--output', `${number}/whisper/audio.%(ext)s` + ], { + encoding: 'utf-8', + }).stdout); + console.log(child_process.spawnSync('find', [number], { + encoding: 'utf-8', + }).stdout); + // Run it 
through whisper + console.log(child_process.spawnSync('python', [ + 'transcribe_audio.py', `${number}/whisper/audio.mp3`, `${number}/whisper/transcription.json` + ], { + encoding: 'utf-8', + }).stderr); + // Now delete the audio file so we don't check it into the repo + fs.unlinkSync(`${number}/whisper/audio.mp3`); + // Load JSON from transcription.json + const transcription = JSON.parse(fs.readFileSync(`${number}/whisper/transcription.json`)); + let comment = ''; + if (transcription.detected_language) { + comment += `Language: ${transcription.detected_language}\n\n`; + } + if (transcription.transcription) { + comment += 'Transcription: `' + transcription.transcription + '`\n\n'; + } + if (transcription.translation) { + comment += 'Translation: `' + transcription.translation + '`\n\n'; + } + // Post the output as a comment; the promise chain is returned so a caller can await it + return github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: comment + }) + // Close the issue only after the comment has been posted + .then(() => github.rest.issues.update({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed' + })); +}