1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
|
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __RAS_EEPROM_H__
#define __RAS_EEPROM_H__
#include "ras_sys.h"
/*
 * RAS EEPROM table format versions.
 * NOTE(review): presumably compared against
 * ras_eeprom_table_header.version -- confirm in the implementation.
 */
#define RAS_TABLE_VER_V1	0x00010000
#define RAS_TABLE_VER_V2_1	0x00021000
#define RAS_TABLE_VER_V3	0x00030000

/*
 * Special record-threshold settings. The negative values are wrapped in
 * parentheses so each one always expands as a single operand.
 * NOTE(review): presumably the values accepted by
 * ras_eeprom_control.record_threshold_config -- confirm with the caller.
 */
#define NONSTOP_OVER_THRESHOLD		(-2)
#define WARN_NONSTOP_OVER_THRESHOLD	(-1)
#define DISABLE_RETIRE_PAGE		0
/*
 * Layout of eeprom_umc_record.retired_row_pfn:
 *   bits [39:0]  - bad address pfn
 *   bits [47:40] - nps mode
 */
#define EEPROM_RECORD_UMC_ADDR_MASK 0xFFFFFFFFFFULL
#define EEPROM_RECORD_UMC_NPS_MASK 0xFF0000000000ULL
#define EEPROM_RECORD_UMC_NPS_SHIFT 40

/* Extract the nps mode (bits [47:40]) from RECORD->retired_row_pfn. */
#define EEPROM_RECORD_UMC_NPS_MODE(RECORD) \
	(((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_NPS_MASK) >> \
	 EEPROM_RECORD_UMC_NPS_SHIFT)

/* Extract the bad address pfn (bits [39:0]) from RECORD->retired_row_pfn. */
#define EEPROM_RECORD_UMC_ADDR_PFN(RECORD) \
	((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_ADDR_MASK)

/*
 * Store ADDR (truncated to 40 bits) and NPS (truncated to 8 bits) into
 * RECORD->retired_row_pfn. Bits above 47 are always written as zero.
 *
 * The value is built in a single expression, without a local temporary,
 * so that an argument which happens to be called "tmp" (or any other
 * name) in the caller can never be captured by the macro expansion.
 * Each argument is evaluated exactly once.
 */
#define EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(RECORD, ADDR, NPS) \
	do { \
		(RECORD)->retired_row_pfn = \
			((((uint64_t)(NPS)) << EEPROM_RECORD_UMC_NPS_SHIFT) & \
			 EEPROM_RECORD_UMC_NPS_MASK) | \
			(((uint64_t)(ADDR)) & EEPROM_RECORD_UMC_ADDR_MASK); \
	} while (0)
/*
 * GPU health state; this is the return type of
 * ras_eeprom_check_gpu_status(). The numeric values are fixed explicitly.
 */
enum ras_gpu_health_status {
	RAS_GPU_HEALTH_NONE                  = 0,
	RAS_GPU_HEALTH_USABLE                = 1,
	RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
	RAS_GPU_IN_BAD_STATUS                = 3,
};
/*
 * Error classification carried by eeprom_umc_record.err_type.
 * RAS_EEPROM_ERR_COUNT is not an error type itself; it equals the number
 * of enumerators that precede it.
 */
enum ras_eeprom_err_type {
	RAS_EEPROM_ERR_NA              = 0,
	RAS_EEPROM_ERR_RECOVERABLE     = 1,
	RAS_EEPROM_ERR_NON_RECOVERABLE = 2,
	RAS_EEPROM_ERR_COUNT           = 3,
};
/*
 * RAS EEPROM table header: five consecutive 32-bit words, declared
 * __packed so no padding is inserted (20 bytes total).
 * NOTE(review): the per-field meanings below are inferred from the names
 * only -- confirm against the code that reads/writes the table.
 */
struct ras_eeprom_table_header {
	uint32_t header;		/* presumably a table signature */
	uint32_t version;		/* presumably a RAS_TABLE_VER_* value */
	uint32_t first_rec_offset;
	uint32_t tbl_size;
	uint32_t checksum;
} __packed;
/*
 * RAS info block of the EEPROM table, declared __packed.
 *
 * The three leading members occupy one 32-bit word (1 + 1 + 2 bytes,
 * assuming u8/u16/u32 are 8/16/32 bits wide); the padding supplies the
 * remaining 64 - 1 words, so the whole block spans 64 words.
 */
struct ras_eeprom_table_ras_info {
	u8  rma_status;
	u8  health_percent;
	u16 ecc_page_threshold;
	u32 padding[64 - 1];
} __packed;
/*
 * Control structure for the RAS EEPROM table: table header and RAS info
 * block, record-threshold settings, I2C access parameters and the
 * bookkeeping needed to locate records inside the table.
 */
struct ras_eeprom_control {
	struct ras_eeprom_table_header tbl_hdr;
	struct ras_eeprom_table_ras_info tbl_rai;

	/* Record threshold.
	 * NOTE(review): record_threshold_config presumably takes the special
	 * values NONSTOP_OVER_THRESHOLD / WARN_NONSTOP_OVER_THRESHOLD /
	 * DISABLE_RETIRE_PAGE -- confirm.
	 */
	int record_threshold_config;
	uint32_t record_threshold_count;
	bool update_channel_flag;

	const struct ras_eeprom_sys_func *sys_func;
	void *i2c_adapter;
	u32 i2c_port;
	u16 max_read_len;
	u16 max_write_len;

	/* Base I2C EEPROM 19-bit memory address at which the table is
	 * located. For more information, see the top of amdgpu_eeprom.c.
	 */
	u32 i2c_address;

	/* Byte offset, relative to @i2c_address, at which the table header
	 * is found. The records start right after the header.
	 */
	u32 ras_header_offset;
	u32 ras_info_offset;
	u32 ras_record_offset;

	/* Number of records currently in the table. */
	u32 ras_num_recs;

	/* Index of the first record to read, 0-based, in the range
	 * [0, num_recs-1]. This is an absolute index that starts counting
	 * right after the table header.
	 */
	u32 ras_fri;

	/* Maximum number of records the table could hold, i.e. its
	 * capacity.
	 */
	u32 ras_max_record_count;

	/* Mutex protecting access to the table. */
	struct mutex ras_tbl_mutex;

	/* Bitmap of the memory channels on which bad pages have occurred. */
	u32 bad_channel_bitmap;
};
/*
 * Represents a single table record.
 *
 * Note that this structure is not declared __packed, and its trailing
 * cur_nps_* members are not saved to the EEPROM, so it is not itself the
 * serialized byte layout of a record.
 * NOTE(review): the conversion to/from the stored form presumably happens
 * in the implementation file -- confirm.
 */
struct eeprom_umc_record {
	union {
		uint64_t address;
		uint64_t offset;
	};

	/* Bad address pfn in bits [39:0], nps mode in bits [47:40]. */
	uint64_t retired_row_pfn;
	uint64_t ts;

	enum ras_eeprom_err_type err_type;

	union {
		unsigned char bank;
		unsigned char cu;
	};

	unsigned char mem_channel;
	unsigned char mcumc_id;

	/* The members below are not saved to the EEPROM. */
	uint64_t cur_nps_retired_row_pfn;
	uint32_t cur_nps_bank;
	uint32_t cur_nps;
};
/* Only pointers to the core context are used here, so a forward
 * declaration is sufficient.
 */
struct ras_core_context;

/*
 * RAS EEPROM interface. The definitions live outside this header.
 * NOTE(review): the grouping comments below are inferred from the
 * function names only -- confirm against the implementation.
 */

/* Hardware init/fini and table reset. */
int ras_eeprom_hw_init(struct ras_core_context *ras_core);
int ras_eeprom_hw_fini(struct ras_core_context *ras_core);
int ras_eeprom_reset_table(struct ras_core_context *ras_core);

bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core);

/* Record access; @records is an array of @num entries (presumably). */
int ras_eeprom_read(struct ras_core_context *ras_core,
		    struct eeprom_umc_record *records,
		    const u32 num);
int ras_eeprom_append(struct ras_core_context *ras_core,
		      struct eeprom_umc_record *records,
		      const u32 num);

/* Record counts. */
uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core);
uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core);

void ras_eeprom_sync_info(struct ras_core_context *ras_core);

/* Status queries. */
int ras_eeprom_check_storage_status(struct ras_core_context *ras_core);
enum ras_gpu_health_status
ras_eeprom_check_gpu_status(struct ras_core_context *ras_core);
#endif
|