Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 6 additions & 174 deletions .github/workflows/issue_created.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,128 +20,16 @@ jobs:
name: Save metadata
with:
script: |
// yt-dlp 'https://www.youtube.com/watch?v=...' --skip-download --write-info-json
const child_process = require('child_process');
const fs = require('fs');
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
return;
}
const url = match[0];
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', `${number}/metadata-temp`]);
child_process.spawnSync('mkdir', ['-p', `${number}/metadata`]);
child_process.spawnSync('yt-dlp', [url, '--skip-download', '--write-info-json'], {
cwd: `${number}/metadata-temp`
});
// Rename to info.json
child_process.exec(`cat ${number}/metadata-temp/*.json | jq 'del(.automatic_captions)' -c > ${number}/metadata/info.json`);
child_process.exec(`rm ${number}/metadata-temp/*.json`);
const script = require('./scripts/save_metadata.js')
script(context)
- uses: actions/github-script@v6
name: Extract captions
# Only run this step if the issue is labeled with "captions"
if: contains(github.event.issue.labels.*.name, 'captions')
with:
script: |
const child_process = require('child_process');
const fs = require('fs');
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '🚫 No URL found in issue body.'
});
return;
}
const url = match[0];
// Create directory named after the issue number
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', number]);
// Create subdirectories subs, auto, whisper
child_process.spawnSync('mkdir', ['-p', `${number}/subs`]);
child_process.spawnSync('mkdir', ['-p', `${number}/auto`]);
let results = [];

function run(command, args, cwd) {
// Returns [status stdout, stderr]
const options = {
cwd: cwd,
encoding: 'utf-8',
};
const result = child_process.spawnSync(command, args, options);
return [result.status, result.stdout, result.stderr];
}
results.push(run('yt-dlp', ['--all-subs', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/subs`));
// Did that create any files?
const files = child_process.spawnSync('ls', [`${number}/subs`]);
if (files.stdout.toString().trim() === '') {
// Use --write-auto-sub
results.push(run('yt-dlp', ['--write-auto-sub', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/auto`));
// Now delete all but the `ru` and `en` files
const autoFiles = fs.readdirSync(`${number}/auto`);
autoFiles.forEach(file => {
if (!file.includes('.ru.') && !file.includes('.en.')) {
fs.unlinkSync(`${number}/auto/${file}`);
}
});
}
const comment = results.map(result => {
return `\`\`\` + ${result[0]}\n${result[1]}\n${result[2]}\`\`\``;
}).join('\n');
// Post the output as a comment
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: comment,
});
// Generate a comment with the result from .ttml or .vtt files
for (const format of ['ttml', 'vtt']) {
const globber = await glob.create(`${number}/**/*.${format}`);
const foundFiles = await globber.glob();
if (foundFiles.length) {
// Pass it through XXXX-to-json
let args = [foundFiles[0]];
let command = null;
if (format === 'vtt') {
args.push('--dedupe');
command = 'webvtt-to-json';
} else {
command = 'ttml-to-json';
}
const captions = JSON.parse(
child_process.spawnSync(command, args, {
encoding: 'utf-8',
}
).stdout);
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '```\n' + captions.map(caption => caption.lines.join('\n')).join('\n') + '\n```',
});
break;
}
}
// Close the issue
github.rest.issues.update({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed'
});
const script = require('./scripts/extract_captions.js')
await script({ context, github, glob })
- uses: actions/github-script@v6
name: Run it through whisper
# Only run this step if the issue is labeled with "whisper"
Expand All @@ -150,64 +38,8 @@ jobs:
REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
with:
script: |
const child_process = require('child_process');
const fs = require('fs');
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
return;
}
const url = match[0];
// Create directory named after the issue number
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', `${number}/whisper`]);
// Use yt-dlp to extract the audio
console.log(child_process.spawnSync('yt-dlp', [
'-x', url, '--audio-format', 'mp3', '--output', `${number}/whisper/audio.%(ext)s`
], {
encoding: 'utf-8',
}).stdout);
console.log(child_process.spawnSync('find', [number], {
encoding: 'utf-8',
}).stdout);
// Run it through whisper
console.log(child_process.spawnSync('python', [
'transcribe_audio.py', `${number}/whisper/audio.mp3`, `${number}/whisper/transcription.json`
], {
encoding: 'utf-8',
}).stderr);
// Now delete the audio file so we don't check it into the repo
fs.unlinkSync(`${number}/whisper/audio.mp3`);
// Load JSON from transcription.json
const transcription = JSON.parse(fs.readFileSync(`${number}/whisper/transcription.json`));
let comment = '';
if (transcription.detected_language) {
comment += `Language: ${transcription.detected_language}\n\n`;
}
if (transcription.transcription) {
comment += 'Transcription: `' + transcription.transcription + '`\n\n';
}
if (transcription.translation) {
comment += 'Translation: `' + transcription.translation + '`\n\n';
}
// Post the output as a comment
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: comment
});
// Close the issue
github.rest.issues.update({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed'
});
const script = require('./scripts/transcribe_with_whisper.js')
script({ context, github })
- name: Commit and push
run: |-
git config user.name "Automated"
Expand Down
98 changes: 98 additions & 0 deletions scripts/extract_captions.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
const child_process = require('child_process');
const fs = require('fs');

/**
 * Run a command synchronously and collect its outcome.
 *
 * @param {string} command - Executable to launch.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Array} `[status, stdout, stderr]`; output is decoded as UTF-8.
 */
function run(command, args, cwd) {
  const { status, stdout, stderr } = child_process.spawnSync(command, args, {
    cwd,
    encoding: 'utf-8',
  });
  return [status, stdout, stderr];
}

module.exports = async ({ context, github, glob }) => {
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '🚫 No URL found in issue body.'
});
return;
}
const url = match[0];
// Create directory named after the issue number
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', number]);
// Create subdirectories subs, auto, whisper
child_process.spawnSync('mkdir', ['-p', `${number}/subs`]);
child_process.spawnSync('mkdir', ['-p', `${number}/auto`]);
let results = [];

results.push(run('yt-dlp', ['--all-subs', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/subs`));
// Did that create any files?
const files = child_process.spawnSync('ls', [`${number}/subs`]);
if (files.stdout.toString().trim() === '') {
// Use --write-auto-sub
results.push(run('yt-dlp', ['--write-auto-sub', '--skip-download', '--sub-format', 'ttml/vtt/best', url], `${number}/auto`));
// Now delete all but the `ru` and `en` files
const autoFiles = fs.readdirSync(`${number}/auto`);
autoFiles.forEach(file => {
if (!file.includes('.ru.') && !file.includes('.en.')) {
fs.unlinkSync(`${number}/auto/${file}`);
}
});
}
const comment = results.map(result => {
return `\`\`\` + ${result[0]}\n${result[1]}\n${result[2]}\`\`\``;
}).join('\n');
// Post the output as a comment
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: comment,
});
// Generate a comment with the result from .ttml or .vtt files
for (const format of ['ttml', 'vtt']) {
const globber = await glob.create(`${number}/**/*.${format}`);
const foundFiles = await globber.glob();
if (foundFiles.length) {
// Pass it through XXXX-to-json
let args = [foundFiles[0]];
let command = null;
if (format === 'vtt') {
args.push('--dedupe');
command = 'webvtt-to-json';
} else {
command = 'ttml-to-json';
}
const captions = JSON.parse(
child_process.spawnSync(command, args, {
encoding: 'utf-8',
}
).stdout);
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '```\n' + captions.map(caption => caption.lines.join('\n')).join('\n') + '\n```',
});
break;
}
}
// Close the issue
github.rest.issues.update({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed'
});
}
24 changes: 24 additions & 0 deletions scripts/save_metadata.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
const child_process = require('child_process');

module.exports = (context) => {
// yt-dlp 'https://www.youtube.com/watch?v=...' --skip-download --write-info-json
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
return;
}
const url = match[0];
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', `${number}/metadata-temp`]);
child_process.spawnSync('mkdir', ['-p', `${number}/metadata`]);
child_process.spawnSync('yt-dlp', [url, '--skip-download', '--write-info-json'], {
cwd: `${number}/metadata-temp`
});
// Rename to info.json
child_process.exec(`cat ${number}/metadata-temp/*.json | jq 'del(.automatic_captions)' -c > ${number}/metadata/info.json`);
child_process.exec(`rm ${number}/metadata-temp/*.json`);
}
61 changes: 61 additions & 0 deletions scripts/transcribe_with_whisper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
const child_process = require('child_process');
const fs = require('fs');

module.exports = ({ context, github }) => {
const body = context.payload.issue.body;
let match = body.match(/https?:\/\/[^\s]+/);
// Ignore github.com URLs
if (match && match[0].includes('https://github.com')) {
match = null;
}
if (!match) {
return;
}
const url = match[0];
// Create directory named after the issue number
const number = context.issue.number.toString();
child_process.spawnSync('mkdir', ['-p', `${number}/whisper`]);
// Use yt-dlp to extract the audio
console.log(child_process.spawnSync('yt-dlp', [
'-x', url, '--audio-format', 'mp3', '--output', `${number}/whisper/audio.%(ext)s`
], {
encoding: 'utf-8',
}).stdout);
console.log(child_process.spawnSync('find', [number], {
encoding: 'utf-8',
}).stdout);
// Run it through whisper
console.log(child_process.spawnSync('python', [
'transcribe_audio.py', `${number}/whisper/audio.mp3`, `${number}/whisper/transcription.json`
], {
encoding: 'utf-8',
}).stderr);
// Now delete the audio file so we don't check it into the repo
fs.unlinkSync(`${number}/whisper/audio.mp3`);
// Load JSON from transcription.json
const transcription = JSON.parse(fs.readFileSync(`${number}/whisper/transcription.json`));
let comment = '';
if (transcription.detected_language) {
comment += `Language: ${transcription.detected_language}\n\n`;
}
if (transcription.transcription) {
comment += 'Transcription: `' + transcription.transcription + '`\n\n';
}
if (transcription.translation) {
comment += 'Translation: `' + transcription.translation + '`\n\n';
}
// Post the output as a comment
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: comment
});
// Close the issue
github.rest.issues.update({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed'
});
}