Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build/i2c/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,8 @@ impl ConfigGenerator {
all.sort();

for (rail, (device, index)) in &all {
// if we update this code to be more clever than just to-lowercase'ing the rail names,
// you might need to go update the mapping in `validate-api`!
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this is now in the control-plane-agent build script...

write!(
&mut self.output,
r##"
Expand Down Expand Up @@ -1776,6 +1778,7 @@ pub fn codegen(settings: impl Into<CodegenSettings>) -> Result<()> {
Ok(())
}

#[derive(Debug, Clone)]
pub struct I2cDeviceDescription {
pub device: String,
pub description: String,
Expand Down
63 changes: 58 additions & 5 deletions drv/i2c-devices/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,22 +66,29 @@ macro_rules! pmbus_read {
}

macro_rules! pmbus_rail_read {
($device:expr, $rail:expr, $cmd:ident) => {
(@raw => $device:expr, $rail:expr, $cmd_code:expr, $len:expr) => {
$device
.write_read_reg::<u8, [u8; $cmd::CommandData::len()]>(
$cmd::CommandData::code(),
.write_read_reg::<u8, [u8; $len]>(
$cmd_code,
&[PAGE::CommandData::code(), $rail],
)
.map_err(|code| Error::BadRead {
cmd: $cmd::CommandData::code(),
cmd: $cmd_code,
code,
})
};

($device:expr, $rail:expr, $cmd:ident) => {{
let cmd_code = $cmd::CommandData::code();
const CMD_LEN: usize = $cmd::CommandData::len();

pmbus_rail_read!(@raw => $device, $rail, cmd_code, CMD_LEN)
.and_then(|rval| {
$cmd::CommandData::from_slice(&rval).ok_or(Error::BadData {
cmd: $cmd::CommandData::code(),
})
})
};
}};

($device:expr, $rail:expr, $dev:ident::$cmd:ident) => {{
use $dev::{PAGE, $cmd};
Expand Down Expand Up @@ -271,6 +278,52 @@ pub trait Validate<T: core::convert::Into<drv_i2c_api::ResponseCode>> {
}
}

// grumble grumble, copied from `gateway_messages::sp_to_mgs::PmbusStatus`
// grumble grumble, also basically the same as `ereports/src/pwr`
pub struct PmbusStatus {
pub status_word: u16,
pub status_vout: u8,
pub status_iout: u8,
pub status_temperature: u8,
pub status_cml: u8,
pub status_other: u8,
pub status_input: u8,
pub status_mfr_specific: u8,
pub status_fans_1_2: u8,
pub status_fans_3_4: u8,
}

pub enum PmbusStatusError {
BadRead { cmd: u8, code: ResponseCode },
BadData { cmd: u8, },
}

impl PmbusStatus {
pub fn read_from(dev: &I2cDevice, rail_idx: u8) -> Result<Self, ()> {
use pmbus::commands::*;
use PmbusStatusError as Error;

Ok(PmbusStatus {
status_word: pmbus_rail_read!(dev, rail_idx, STATUS_WORD).map_err(drop)?.0,
status_vout: pmbus_rail_read!(dev, rail_idx, STATUS_VOUT).map_err(drop)?.0,
status_iout: pmbus_rail_read!(dev, rail_idx, STATUS_IOUT).map_err(drop)?.0,
status_temperature: pmbus_rail_read!(dev, rail_idx, STATUS_TEMPERATURE).map_err(drop)?.0,
status_cml: pmbus_rail_read!(dev, rail_idx, STATUS_CML).map_err(drop)?.0,
status_other: pmbus_rail_read!(dev, rail_idx, STATUS_OTHER).map_err(drop)?.0,
status_input: pmbus_rail_read!(dev, rail_idx, STATUS_INPUT).map_err(drop)?.0,
status_fans_1_2: pmbus_rail_read!(dev, rail_idx, STATUS_FANS_1_2).map_err(drop)?.0,
status_fans_3_4: pmbus_rail_read!(dev, rail_idx, STATUS_FANS_3_4).map_err(drop)?.0,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm...I'd be interested to know what happens if we try to read either STATUS_FANS register from a device that doesn't have fans, or if we try to read STATUS_FANS_3_4 from a device that only has 1 or 2 fans. Do we get back a read that's all zeroes, or does the device return a NAK or something?

§17.10 and §17.10 in the PMBus Specification, Part II, Version 1.3.1 doesn't explicitly say what a device without fans is supposed to do, so, if I had to guess, it's probably left up to the manufacturer...

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Attempting to find out what happens when one does this experimentally, and I have discovered that, using the humility pmbus command to ask a RAA229620A for its STATUS_FANS_1_2, it just...prints nothing:

eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r VDDCR_CPU0_A0 -C STATUS_FANS_1_2
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
eliza@alfred ~ $

I'm not actually sure what "nothing" means here!

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The verbose mode also makes it say "nothing":

eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r VDDCR_CPU0_A0 -C STATUS_FANS_1_2 --verbose
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
eliza@alfred ~ $

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OH, you have to pass the --errors flag to humility pmbus to make it show errors. So the RAA229620A returns Noregister when you ask for its STATUS_FANS_1_2 or STATUS_FANS_3_4:

eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r VDDCR_CPU0_A0 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
0x82 STATUS_FANS_3_4           Err(NoRegister)
eliza@alfred ~ $

The ISL68224, ADM127X, TPS546B24A also do the same when asked for their STATUS_FANS registers, while the BMR491 appears to...still do something that results in Humility printing nothing?

eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r VDDCR_CPU0_A0 -C STATUS_FANS_1_2 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r VDDCR_CPU0_A0 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
0x82 STATUS_FANS_3_4           Err(NoRegister)
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r V1P8_SP5_A0 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
humility pmbus failed: rail V1P8_SP5_A0 not found
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r V1P8_SP5_A1 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
0x82 STATUS_FANS_3_4           Err(NoRegister)
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r V54P5_IBC_A3 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
0x82 STATUS_FANS_3_4           Err(NoRegister)
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r V3P3_SP_A2 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
0x81 STATUS_FANS_1_2           Err(NoRegister)
0x82 STATUS_FANS_3_4           Err(NoRegister)
eliza@alfred ~ $ pfexec /staff/eliza/humility -t cosmo-hubris-sp pmbus -r V12_SYS_A2 -C STATUS_FANS_1_2,STATUS_FANS_3_4 --verbose --errors
humility: attached to 1fc9:0143:XJACFXSEKTQJS via CMSIS-DAP
eliza@alfred ~ $

This is sort of what I had figured, and suggests that we need to either have some additional configuration option that says whether or not to read the fans, and/or (and this would be my preference) change PmbusStatus::read_from to not bail out if any read returns an error, as I suggested in #2538 (comment).


// Unfortunately, STATUS_MFR_SPECIFIC *is* defined in the pmbus crate, but doesn't have a
// "structured" representation, so instead use a raw representation. It *could* be argued
// that since we're not actually peeking at any of the introspection stuff, we could do
// the same for all the items above, and save ourselved a little indirection, but for now
// just hole-punch the minimum amount necessary
status_mfr_specific: pmbus_rail_read!(@raw => dev, rail_idx, CommandCode::STATUS_MFR_SPECIFIC as u8, 1).map_err(drop)?[0],
})
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love that this will bail out if any read returns an error. I think it would be much better to change the PmbusStatus type so that each register's field was a Result<u8, i2c::ResponseCode> or something. That way, we can always communicate whatever registers we were able to successfully read to upstack software even if we encounter some weird I2C bus weather or something. Also, if (as I mentioned in my other comment) a device without fans does choose to NAK one of the STATUS_FANS registers, that doesn't result in us being permanently unable to read its status.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do! I wondered whether to make these Options or Results, and I should definitely do one of those.

I wonder if we could capture "this device supports these statuses" in the pmbus crate and maybe codegen based on this so we don't spend time doing I2C we know is going to be NAK'd, but this could also be a good follow-up.

Since we're not using device-specific methods here, I would imagine we would want to pass in some kind of bit-flag of which ones to check or not, and just have a None or Err(Unsupported).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we could capture "this device supports these statuses" in the pmbus crate and maybe codegen based on this so we don't spend time doing I2C we know is going to be NAK'd, but this could also be a good follow-up.

Yeah, we could either do this in the pmbus crate or in the codegen itself. Since we have a string saying which device it is, the codegen could have its own understanding of which registers to skip for certain devices. It probably should fall back to querying all registers for device strings it doesn't recognize, maybe along with a build warning.

I do worry slightly about a device firmware revision changing its behavior to something that doesn't match the codegen's understanding, but that feels pretty unlikely. For instance, an RAA229620A VRM is not going to suddenly grow fans if its firmware is updated!


Incidentally, there is another option, which is the PMBus QUERY command (§11.13 PMBus Specification, Part II, Version 1.3.1). This allows you to ask a device whether it supports a particular PMBus command, such as STATUS_FANS_1_2 and get back a response that tells you whether the device supports it. I'm not sure how useful this is to us here, though --- if the goal is mostly to avoid I2C traffic for unsupported registers, using QUERY to ask "hey, do you have STATUS_FANS_1_2? okay, do you have STATUS_FANS_3_4?" results in just as much I2C traffic if the device doesn't have those registers, and more if it does. We could probably avoid that by QUERYing once and caching the result somewhere, but the current code is not structured in a way that gives us a place to stash that, so it would be a bit more complex.

At the end of the day, I just don't really think that's particularly worthwhile: since we know all the devices on the board at build time, we can just decide which registers to read when we do the code generation, and we don't have to dynamically detect capabilities. I think that QUERY is more intended for the use case of commodity motherboards which might end up talking to any arbitrary commodity PSUs, where the firmware doesn't know what PMBus commands are supported until the system boots up, and therefore must dynamically detect them. But, I felt like it was worth mentioning, if only to write down why we probably don't need to use it.

}
}

pub mod adm127x;
pub mod adt7420;
pub mod at24csw080;
Expand Down
58 changes: 55 additions & 3 deletions task/validate-api/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
use std::io::Write;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
if let Err(e) = build_i2c::codegen(build_i2c::Disposition::Devices) {
println!("cargo::error=failed to generate I2C devices: {e}");
std::process::exit(1);
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Soooo, I realize that in my VPD PR, I added all the code generation to task-validate-api, but I'm not sure if we need to do that here. In the VPD branch (which I need to resurrect), it was necessary to add the VPD-reading code to task-validate-api's codegen, because validate generates the list of DeviceDescriptions, and I wanted to add device VPD to the component_details MGS API, which is driven by the list of DeviceDescriptions from task-validate-api.

Since we're looking up devices for reading PMBus status using a new lookup table of rail names to devices, I think there isn't actually any reason that that needs to go in task-validate-api. I think it might be nicer to just put that in control-plane-agent's build script, or make it a Disposition for build_i2c::codegen or something. That way, the validate API crate need not actually inflict all the I2C codegen on every task that depends on it. What do you think?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that sounds reasonable! I put it in task-validate-api because I was copying your VPD PR, but I admit to wondering "why does it need to be here/like this?".

I'll move it over to control plane agent, or as a new codegen path.


write_pub_device_descriptions()?;

idol::client::build_client_stub(
Expand All @@ -21,7 +26,7 @@ fn write_pub_device_descriptions() -> anyhow::Result<()> {
let out_dir = std::env::var("OUT_DIR")?;
let dest_path =
std::path::Path::new(&out_dir).join("device_descriptions.rs");
let file = std::fs::File::create(dest_path)?;
let file = std::fs::File::create(&dest_path)?;
let mut file = std::io::BufWriter::new(file);

writeln!(
Expand Down Expand Up @@ -57,13 +62,14 @@ fn write_pub_device_descriptions() -> anyhow::Result<()> {
// key.
//
let mut id2idx = std::collections::BTreeMap::new();
let mut pmbus_rail_names = std::collections::BTreeSet::new();

for (idx, dev) in devices.into_iter().enumerate() {
for (idx, dev) in devices.iter().cloned().enumerate() {
let is_pmbus = dev.is_pmbus();
writeln!(file, " DeviceDescription {{")?;
writeln!(file, " device: {:?},", dev.device)?;
writeln!(file, " description: {:?},", dev.description)?;
if let Some(id) = dev.device_id {
if let Some(ref id) = dev.device_id {
if let Ok(component) = SpComponent::try_from(id.as_ref()) {
writeln!(file, " id: {:?},", component.id)?;
if id2idx.insert(component.id, idx).is_some() {
Expand All @@ -77,6 +83,15 @@ fn write_pub_device_descriptions() -> anyhow::Result<()> {
);
ids_too_long += 1;
}

if let Some(ref pmbus) = dev.pmbus {
for rail in pmbus.rails.iter() {
// Returns "is unique", unlike `BTreeMap::insert().is_some()`!
if !pmbus_rail_names.insert(rail.name.clone()) {
panic!("cargo::warn=dupe: {:?}, {:?}", rail.name, dev);
}
}
}
} else {
println!(
"cargo::error=device {:?} ({:?}) hath no device ID (refdes)",
Expand Down Expand Up @@ -114,6 +129,39 @@ fn write_pub_device_descriptions() -> anyhow::Result<()> {
}
writeln!(file, "];")?;

let max_len = pmbus_rail_names.iter().map(String::len).max();
if let Some(_len) = max_len {

// TODO: do we need fixed-length bstrings? If we want to binary search, we probably want them to
// be truncated to some maximum length, so we either need to define the max length at the
// protocol level so mgs knows how long of a string to send us, or we could instead trim the
// trailing nulls and search by that instead.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i suspect the lookup code is going to end up looking something like the code in control-plane-agent for looking up a component by ID:

impl TryFrom<&'_ SpComponent> for Index {
type Error = SpError;
fn try_from(component: &'_ SpComponent) -> Result<Self, Self::Error> {
if let Ok(entry_idx) = task_validate_api::DEVICE_INDICES_BY_SORTED_ID
.binary_search_by_key(&component.id, |&(id, _)| id)
{
let &(_, index) = task_validate_api::DEVICE_INDICES_BY_SORTED_ID
.get(entry_idx)
.unwrap_lite();
return Ok(Self::ValidateDevice(index));
}
for (i, d) in OUR_DEVICES.iter().enumerate() {
if *component == d.component {
return Ok(Self::OurDevice(i));
}
}
Err(SpError::RequestUnsupportedForComponent)
}
}

as for maximum length, i think that the wire message will certainly have to have one, a la SpComponent in gateway-messages: https://github.com/oxidecomputer/management-gateway-service/blob/6c0aca2545a73fd75536e149d29faa7108be5862/gateway-messages/src/lib.rs#L284-L294

I really don't love the null-padding, but even if we were to adopt an encoding that let us not do that part, we would still need to have some max length somewhere so we can limit the buffer size...

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think the distinction that I wanted to make (or at least still need to shake out) is that just because it needs to be zero-padded on the wire, that doesn't necessarily mean that internally we ALSO need to zero-pad it.

When receiving the command over the wire, we could first trim any trailing nulls, and then use the slice locally instead. That being said, if I bump into anywhere else where we need fixed length arrays, I'll probably revert back to zero-padding in the codegen.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think avoiding zero-padding here is probably nicer if you can manage it --- it should save us a bit of flash in the lookup table even if it doesn't save us the stack space when we're passing around the wire message.

//
// writeln!(file, "pub const MAX_PMBUS_RAIL_NAME: usize = {len};")?;
// writeln!(file, "pub const PMBUS_RAIL_TO_I2C_DEVICE_MAP: [([u8; MAX_PMBUS_RAIL_NAME], fn(TaskId) -> (drv_i2c_api::I2cDevice, u8)); {}] = [", pmbus_rail_names.len())?;
// for rail in pmbus_rail_names.iter() {
// write!(file, " (*b\"{rail}")?;
// for _ in 0..(len.checked_sub(rail.len()).unwrap()) {
// write!(file, "\\0")?;
// }
// write!(file, "\", ")?;
// write!(file, "crate::i2c_config::pmbus::{}", rail.to_lowercase())?;
// writeln!(file, "),")?;
// }
// writeln!(file, "];")?;

// Assuming we are going the trimmed route...
writeln!(file)?;
writeln!(file, "pub const PMBUS_RAIL_TO_I2C_DEVICE_MAP: [(&[u8], fn(TaskId) -> (drv_i2c_api::I2cDevice, u8)); {}] = [", pmbus_rail_names.len())?;
for rail in pmbus_rail_names.iter() {
write!(file, " (b\"{rail}\", ")?;
// build_i2c *also* only to-lowercases the rail names to make functions
write!(file, "crate::i2c_config::pmbus::{}", rail.to_lowercase())?;
writeln!(file, "),")?;
}
writeln!(file, "];")?;
}

file.flush()?;

anyhow::ensure!(missing_ids == 0, "{missing_ids} devices have no ID!");
Expand All @@ -129,5 +177,9 @@ fn write_pub_device_descriptions() -> anyhow::Result<()> {
SpComponent::MAX_ID_LENGTH,
);


// panic!("{}", dest_path.display());
// panic!("{max_len:?} {:#?}", pmbus_rail_names);

Ok(())
}
2 changes: 2 additions & 0 deletions task/validate-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@ pub struct DeviceDescription {
}

include!(concat!(env!("OUT_DIR"), "/device_descriptions.rs"));

include!(concat!(env!("OUT_DIR"), "/i2c_config.rs"));
Loading