diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 4a46b27ee9a0..1b82f7e87393 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -167,6 +167,7 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -463,7 +464,6 @@ struct bau_pq_entry { struct msg_desc { struct bau_pq_entry *msg; int msg_slot; - int swack_slot; struct bau_pq_entry *queue_first; struct bau_pq_entry *queue_last; }; @@ -517,6 +517,9 @@ struct ptc_stats { unsigned long s_retry_messages; /* retry broadcasts */ unsigned long s_bau_reenabled; /* for bau enable/disable */ unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -593,6 +596,8 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -610,6 +615,7 @@ struct bau_control { int cong_response_us; int cong_reps; int cong_period; + unsigned long clocks_per_100_usec; cycles_t period_time; long period_requests; struct hub_and_pnode *thp; @@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr) uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index c425ff1a9cc3..9010ca715c03 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; struct bau_pq_entry *msg; msg = mdp->msg; - if (!msg->canceled) { + if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; write_mmr_sw_ack(dw); } @@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { unsigned long mr; /* - * is the resource timed out? - * make everyone ignore the cancelled message. + * Is the resource timed out? + * Make everyone ignore the cancelled message. */ msg2->canceled = 1; stat->d_canceled++; @@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { short socket_ack_count = 0; short *sp; @@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp, if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) */ - reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp, do_acknowledge); } } @@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, /* * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; unsigned long descriptor_status2; descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; descriptor_status = (descriptor_status << 1) | descriptor_status2; return descriptor_status; } +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. + */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; + } + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. + */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + static int uv2_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; + long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); + } + } /* * descriptor_stat is still BUSY */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -563,14 +687,14 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_AS) { + if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } if (bcp->uvhub_version == 1) @@ -752,8 +876,7 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) { int seq_number = 0; int completion_stat = 0; @@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) { - uv1 = 1; + if (bcp->uvhub_version == 1) uv1_throttle(hmaster, stat); - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); do { - if (try == 0) { + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); + /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, record_send_stats(time1, time2, bcp, stat, completion_stat, try); if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ return 1; return 0; } @@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -996,12 +1126,85 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp)) return NULL; else return cpumask; } +/* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) +{ + struct bau_pq_entry *msg_next = msg + 1; + + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; + } + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. + */ + return; + } + + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); + + return; +} + /* * The BAU message interrupt comes here. (registered by set_intr_gate) * See entry_64.S @@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs) count++; msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - bau_process_message(&msgdesc, bcp); + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; if (msg > msgdesc.queue_last) @@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, - "disable enable\n"); + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); } return 0; } @@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode) write_mmr_payload_first(pnode, pn_first); write_mmr_payload_tail(pnode, first); write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables(void) bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); } } @@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu);