Add coreos-installer install --fetch-retries
We're hitting issues in RHCOS right now where `coreos-installer.service` appears to be racing with networking coming fully up, even though we're ordered after `network-online.target` and `systemd-resolved.service` (though RHCOS doesn't use the latter).

The issue may turn out to be misconfigured networking rather than a race, but either way we really should be retrying fetches.

I gated this behind a switch instead of enabling it by default for all fetches because, when running interactively, I think it makes more sense not to retry. (The same goes for other commands like `download` and `list-stream`.)

Related: https://bugzilla.redhat.com/show_bug.cgi?id=1974411
Related: https://bugzilla.redhat.com/show_bug.cgi?id=1967483

Closes: coreos#283
jlebon committed Jul 28, 2021
1 parent d57664a commit 955980b
Showing 6 changed files with 130 additions and 31 deletions.
1 change: 1 addition & 0 deletions docs/cmd/download.md
@@ -25,3 +25,4 @@ Download a CoreOS image
| **--decompress**, **-d** | Decompress image and don't save signature |
| **--insecure** | Skip signature verification |
| **--stream-base-url** *URL* | Base URL for Fedora CoreOS stream metadata |
| **--fetch-retries** *N* | Fetch retries, or string "infinite" |
1 change: 1 addition & 0 deletions docs/cmd/install.md
@@ -39,3 +39,4 @@ Install Fedora CoreOS or RHEL CoreOS
| **--stream-base-url** *URL* | Base URL for Fedora CoreOS stream metadata |
| **--architecture** *name* | Target CPU architecture [default: x86_64] |
| **--preserve-on-error** | Don't clear partition table on error |
| **--fetch-retries** *N* | Fetch retries, or string "infinite" |
3 changes: 3 additions & 0 deletions scripts/coreos-installer-service
@@ -97,6 +97,9 @@ if karg_bool coreos.inst.insecure; then
args+=("--insecure")
fi

# Always retry HTTP requests; we've got nothing to lose since we fail anyway.
args+=("--fetch-retries" "infinite")

# Ensure device nodes have been created
udevadm settle

59 changes: 56 additions & 3 deletions src/cmdline.rs
@@ -62,6 +62,14 @@ pub struct InstallConfig {
pub preserve_on_error: bool,
pub network_config: Option<String>,
pub save_partitions: Vec<PartitionFilter>,
pub fetch_retries: FetchRetries,
}

#[derive(Debug, Clone, Copy)]
pub enum FetchRetries {
Infinite,
Finite(NonZeroU32),
None,
}

#[derive(Debug, PartialEq, Eq)]
@@ -334,6 +342,13 @@ pub fn parse_args() -> Result<Config> {
.long("preserve-on-error")
.help("Don't clear partition table on error"),
)
.arg(
Arg::with_name("fetch-retries")
.long("fetch-retries")
.value_name("N")
.help("Fetch retries, or string \"infinite\"")
.takes_value(true),
)
// positional args
.arg(
Arg::with_name("device")
@@ -414,6 +429,13 @@ pub fn parse_args() -> Result<Config> {
.value_name("URL")
.help("Base URL for Fedora CoreOS stream metadata")
.takes_value(true),
)
.arg(
Arg::with_name("fetch-retries")
.long("fetch-retries")
.value_name("N")
.help("Fetch retries, or string \"infinite\"")
.takes_value(true),
),
)
.subcommand(
@@ -855,6 +877,20 @@ fn parse_install(matches: &ArgMatches) -> Result<Config> {
.value_of("architecture")
.expect("architecture missing");

let fetch_retries: FetchRetries = matches
.value_of("fetch-retries")
.map(|s| match s {
"infinite" => Ok(FetchRetries::Infinite),
num => num.parse::<u32>().map(|num| {
NonZeroU32::new(num)
.map(FetchRetries::Finite)
.unwrap_or(FetchRetries::None)
}),
})
.transpose()
.context("parsing --fetch-retries")?
.unwrap_or(FetchRetries::None);

// Uninitialized ECKD DASD's blocksize is 512, but after formatting
// it changes to the recommended 4096
// https://bugzilla.redhat.com/show_bug.cgi?id=1905159
@@ -878,7 +914,7 @@ fn parse_install(matches: &ArgMatches) -> Result<Config> {
} else if matches.is_present("image-url") {
let image_url = Url::parse(matches.value_of("image-url").expect("image-url missing"))
.context("parsing image URL")?;
Box::new(UrlLocation::new(&image_url))
Box::new(UrlLocation::new(&image_url, fetch_retries))
} else if matches.is_present("offline") {
match OsmetLocation::new(architecture, sector_size)? {
Some(osmet) => Box::new(osmet),
@@ -921,6 +957,7 @@ fn parse_install(matches: &ArgMatches) -> Result<Config> {
"metal",
format,
base_url.as_ref(),
fetch_retries,
)?)
}
};
@@ -944,7 +981,7 @@ fn parse_install(matches: &ArgMatches) -> Result<Config> {
} else if !url.starts_with("https://") {
bail!("unknown protocol for URL '{}'", url);
}
download_to_tempfile(url)
download_to_tempfile(url, fetch_retries)
.with_context(|| format!("downloading source Ignition config {}", url))
}).transpose()?
} else {
@@ -968,6 +1005,7 @@ fn parse_install(matches: &ArgMatches) -> Result<Config> {
device,
location,
ignition,
fetch_retries,
ignition_hash: matches
.value_of("ignition-hash")
.map(IgnitionHash::try_parse)
@@ -1051,10 +1089,24 @@ fn parse_download(matches: &ArgMatches) -> Result<Config> {
// Build image location. Ideally we'd use conflicts_with (and an
// ArgGroup for streams), but that doesn't play well with default
// arguments, so we manually prioritize modes.
let fetch_retries: FetchRetries = matches
.value_of("fetch-retries")
.map(|s| match s {
"infinite" => Ok(FetchRetries::Infinite),
num => num.parse::<u32>().map(|num| {
NonZeroU32::new(num)
.map(FetchRetries::Finite)
.unwrap_or(FetchRetries::None)
}),
})
.transpose()
.context("parsing --fetch-retries")?
.unwrap_or(FetchRetries::None);

let location: Box<dyn ImageLocation> = if matches.is_present("image-url") {
let image_url = Url::parse(matches.value_of("image-url").expect("image-url missing"))
.context("parsing image URL")?;
Box::new(UrlLocation::new(&image_url))
Box::new(UrlLocation::new(&image_url, fetch_retries))
} else {
let base_url = if let Some(stream_base_url) = matches.value_of("stream-base-url") {
Some(Url::parse(stream_base_url).context("parsing stream base URL")?)
@@ -1069,6 +1121,7 @@ fn parse_download(matches: &ArgMatches) -> Result<Config> {
matches.value_of("platform").expect("platform missing"),
matches.value_of("format").expect("format missing"),
base_url.as_ref(),
fetch_retries,
)?)
};

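The `--fetch-retries` value mapping is identical in `parse_install()` and `parse_download()` above: the literal string `infinite` selects unbounded retries, `0` disables retries, and any positive integer N allows N retries before giving up. A minimal standalone sketch of that mapping (the `parse_fetch_retries` helper and the `main` driver are illustrative only, not part of this commit):

```rust
use std::num::NonZeroU32;

#[derive(Debug, Clone, Copy)]
enum FetchRetries {
    Infinite,
    Finite(NonZeroU32),
    None,
}

// Mirrors the closure used in parse_install()/parse_download():
// "infinite" => Infinite, "0" => None, "N" (N > 0) => Finite(N),
// and anything else is a parse error.
fn parse_fetch_retries(s: &str) -> Result<FetchRetries, std::num::ParseIntError> {
    match s {
        "infinite" => Ok(FetchRetries::Infinite),
        num => num.parse::<u32>().map(|n| {
            NonZeroU32::new(n)
                .map(FetchRetries::Finite)
                .unwrap_or(FetchRetries::None)
        }),
    }
}

fn main() {
    for arg in &["infinite", "0", "3", "not-a-number"] {
        println!("{:?} -> {:?}", arg, parse_fetch_retries(arg));
    }
}
```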
9 changes: 2 additions & 7 deletions src/download.rs
@@ -327,16 +327,11 @@ pub fn image_copy_default(
Ok(())
}

pub fn download_to_tempfile(url: &str) -> Result<File> {
pub fn download_to_tempfile(url: &str, retries: FetchRetries) -> Result<File> {
let mut f = tempfile::tempfile()?;

let client = new_http_client()?;
let mut resp = client
.get(url)
.send()
.with_context(|| format!("sending request for '{}'", url))?
.error_for_status()
.with_context(|| format!("fetching '{}'", url))?;
let mut resp = http_get(client, url, retries)?;

let mut writer = BufWriter::with_capacity(BUFFER_SIZE, &mut f);
copy(
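`download_to_tempfile()` now delegates the request, and any retries, to `http_get()`; the explicit `send()`/`error_for_status()` chain it used before collapses into that single call. A rough self-contained sketch of the function's overall shape, using reqwest's blocking client directly where the real code calls `http_get()` (the buffer size, the final rewind, and the exact error contexts are assumptions for illustration, not copied from the repository; it assumes the anyhow, reqwest, and tempfile crates the project already uses):

```rust
use std::fs::File;
use std::io::{copy, BufWriter, Seek, SeekFrom, Write};

use anyhow::{Context, Result};

const BUFFER_SIZE: usize = 256 * 1024; // assumed value, for illustration only

fn download_to_tempfile_sketch(url: &str) -> Result<File> {
    let mut f = tempfile::tempfile()?;

    // In the real function this is `http_get(client, url, retries)?`, which
    // wraps the same request in retries with exponential backoff.
    let client = reqwest::blocking::Client::new();
    let mut resp = client
        .get(url)
        .send()
        .with_context(|| format!("sending request for '{}'", url))?
        .error_for_status()
        .with_context(|| format!("fetching '{}'", url))?;

    {
        let mut writer = BufWriter::with_capacity(BUFFER_SIZE, &mut f);
        copy(&mut resp, &mut writer).context("downloading to temporary file")?;
        writer.flush().context("flushing temporary file")?;
    }
    f.seek(SeekFrom::Start(0)).context("rewinding temporary file")?;
    Ok(f)
}

fn main() -> Result<()> {
    // Hypothetical URL, for demonstration only.
    let _f = download_to_tempfile_sketch("https://example.com/config.ign")?;
    println!("downloaded to an unnamed temporary file");
    Ok(())
}
```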
88 changes: 67 additions & 21 deletions src/source.rs
@@ -20,6 +20,7 @@ use std::fmt::{Display, Formatter};
use std::fs::OpenOptions;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use std::thread::sleep;
use std::time::Duration;

use crate::cmdline::*;
@@ -65,6 +66,7 @@ pub struct UrlLocation {
image_url: Url,
sig_url: Url,
artifact_type: String,
retries: FetchRetries,
}

// Remote image source specified by Fedora CoreOS stream metadata
@@ -76,6 +78,7 @@ pub struct StreamLocation {
architecture: String,
platform: String,
format: String,
retries: FetchRetries,
}

pub struct ImageSource {
@@ -150,28 +153,25 @@ impl ImageLocation for FileLocation {
}

impl UrlLocation {
pub fn new(url: &Url) -> Self {
pub fn new(url: &Url, retries: FetchRetries) -> Self {
let mut sig_url = url.clone();
sig_url.set_path(&format!("{}.sig", sig_url.path()));
Self::new_with_sig_and_type(url, &sig_url, "disk")
Self::new_full(url, &sig_url, "disk", retries)
}

fn new_with_sig_and_type(url: &Url, sig_url: &Url, artifact_type: &str) -> Self {
fn new_full(url: &Url, sig_url: &Url, artifact_type: &str, retries: FetchRetries) -> Self {
Self {
image_url: url.clone(),
sig_url: sig_url.clone(),
artifact_type: artifact_type.to_string(),
retries,
}
}

/// Fetch signature content from URL.
fn fetch_signature(sig_url: &Url) -> Result<Vec<u8>> {
fn fetch_signature(&self) -> Result<Vec<u8>> {
let client = new_http_client()?;
let mut resp = client
.get(sig_url.clone())
.send()
.context("sending signature request")?
.error_for_status()
let mut resp = http_get(client, self.sig_url.as_str(), self.retries)
.context("fetching signature URL")?;

let mut sig_bytes = Vec::new();
@@ -193,15 +193,14 @@ impl Display for UrlLocation {

impl ImageLocation for UrlLocation {
fn sources(&self) -> Result<Vec<ImageSource>> {
let signature = Self::fetch_signature(&self.sig_url)
let signature = self
.fetch_signature()
.map_err(|e| eprintln!("Failed to fetch signature: {}", e))
.ok();

// start fetch, get length
let client = new_http_client()?;
let resp = client
.get(self.image_url.clone())
.send()
let resp = http_get(client, self.image_url.as_str(), self.retries)
.context("fetching image URL")?;
match resp.status() {
StatusCode::OK => (),
@@ -234,6 +233,7 @@ impl StreamLocation {
platform: &str,
format: &str,
base_url: Option<&Url>,
retries: FetchRetries,
) -> Result<Self> {
Ok(Self {
stream_base_url: base_url.cloned(),
@@ -242,6 +242,7 @@
architecture: architecture.to_string(),
platform: platform.to_string(),
format: format.to_string(),
retries,
})
}
}
@@ -268,7 +269,7 @@ impl ImageLocation for StreamLocation {
fn sources(&self) -> Result<Vec<ImageSource>> {
// fetch and parse stream metadata
let client = new_http_client()?;
let stream = fetch_stream(client, &self.stream_url)?;
let stream = fetch_stream(client, &self.stream_url, self.retries)?;

// descend it
let artifacts = stream
@@ -293,7 +294,7 @@ impl ImageLocation for StreamLocation {
let signature_url = Url::parse(&artifact.signature)
.context("parsing signature URL from stream metadata")?;
let mut artifact_sources =
UrlLocation::new_with_sig_and_type(&artifact_url, &signature_url, &artifact_type)
UrlLocation::new_full(&artifact_url, &signature_url, &artifact_type, self.retries)
.sources()?;
sources.append(&mut artifact_sources);
}
@@ -385,7 +386,7 @@ pub fn list_stream(config: &ListStreamConfig) -> Result<()> {
// fetch stream metadata
let client = new_http_client()?;
let stream_url = build_stream_url(&config.stream, config.stream_base_url.as_ref())?;
let stream = fetch_stream(client, &stream_url)?;
let stream = fetch_stream(client, &stream_url, FetchRetries::None)?;

// walk formats
let mut rows: Vec<Row> = Vec::new();
@@ -439,12 +440,9 @@ fn build_stream_url(stream: &str, base_url: Option<&Url>) -> Result<Url> {
}

/// Fetch and parse stream metadata.
fn fetch_stream(client: blocking::Client, url: &Url) -> Result<Stream> {
fn fetch_stream(client: blocking::Client, url: &Url, retries: FetchRetries) -> Result<Stream> {
// fetch stream metadata
let resp = client
.get(url.clone())
.send()
.context("fetching stream metadata")?;
let resp = http_get(client, url.as_str(), retries).context("fetching stream metadata")?;
match resp.status() {
StatusCode::OK => (),
s => bail!("stream metadata fetch from {} failed: {}", url, s),
@@ -463,6 +461,54 @@ pub fn new_http_client() -> Result<blocking::Client> {
.context("building HTTP client")
}

/// Wrapper around Client::get() with error handling based on HTTP return code and optionally basic
/// exponential backoff retries for transient errors.
pub fn http_get(
client: blocking::Client,
url: &str,
retries: FetchRetries,
) -> Result<blocking::Response> {
// this matches `curl --retry` semantics -- see list in `curl(1)`
const RETRY_STATUS_CODES: [u16; 6] = [408, 429, 500, 502, 503, 504];

let mut delay = 1;
let (infinite, mut tries) = match retries {
FetchRetries::Infinite => (true, 0),
FetchRetries::Finite(n) => (false, n.get() + 1),
FetchRetries::None => (false, 1),
};

loop {
let err: anyhow::Error = match client.get(url).send() {
Err(err) => err.into(),
Ok(resp) => match resp.status().as_u16() {
code if RETRY_STATUS_CODES.contains(&code) => anyhow!(
"HTTP {} {}",
code,
resp.status().canonical_reason().unwrap_or("")
),
_ => {
return resp
.error_for_status()
.with_context(|| format!("fetching '{}'", url));
}
},
};

if !infinite {
tries -= 1;
if tries == 0 {
return Err(err).with_context(|| format!("fetching '{}'", url));
}
}

eprintln!("Error fetching '{}': {}", url, err);
eprintln!("Sleeping {}s and retrying...", delay);
sleep(Duration::from_secs(delay));
delay = std::cmp::min(delay * 2, 10 * 60); // cap to 10 mins; matches curl
}
}

#[derive(Debug, Deserialize)]
struct Stream {
architectures: HashMap<String, Arch>,
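For `Finite(n)`, the loop above makes `n + 1` attempts in total (note the `n.get() + 1`); `None` means a single attempt, and `Infinite` keeps trying until a request succeeds or fails with a non-transient status. Only request errors and the curl-style transient statuses (408, 429, 500, 502, 503, 504) trigger a retry, with the sleep starting at 1s, doubling after each failure, and capping at 10 minutes. A tiny standalone illustration of that backoff schedule (not part of the commit):

```rust
// Prints the delay http_get() would sleep after each consecutive failure:
// 1s, 2s, 4s, ... capped at 600s (10 minutes).
fn main() {
    let mut delay: u64 = 1;
    for attempt in 1..=12 {
        println!("attempt {:2} failed -> sleep {}s before retrying", attempt, delay);
        delay = std::cmp::min(delay * 2, 10 * 60);
    }
}
```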
