/*
 * IR - Lightweight JIT Compilation Framework
 * (RA - Register Allocation, Liveness, Coalescing, SSA Deconstruction)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 *
 * See: "Linear Scan Register Allocation on SSA Form", Christian Wimmer and
 * Michael Franz, CGO'10 (2010)
 * See: "Optimized Interval Splitting in a Linear Scan Register Allocator",
 * Christian Wimmer VEE'10 (2005)
 */

#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif

#include <stdlib.h>
#include "ir.h"

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_aarch64.h"
#else
# error "Unknown IR target"
#endif

#include "ir_private.h"

int ir_regs_number(void)
{
	return IR_REG_NUM;
}

bool ir_reg_is_int(int32_t reg)
{
	IR_ASSERT(reg >= 0 && reg < IR_REG_NUM);
	return reg >= IR_REG_GP_FIRST && reg <= IR_REG_GP_LAST;
}

static int ir_assign_virtual_registers_slow(ir_ctx *ctx)
{
	uint32_t *vregs;
	uint32_t vregs_count = 0;
	uint32_t b;
	ir_ref i, n;
	ir_block *bb;
	ir_insn *insn;
	uint32_t flags;

	/* Assign unique virtual register to each data node */
	vregs = ir_mem_calloc(ctx->insns_count, sizeof(ir_ref));
	n = 1;
	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		i = bb->start;

		/* skip first instruction */
		insn = ctx->ir_base + i;
		n = ir_insn_len(insn);
		i += n;
		insn += n;
		while (i < bb->end) {
			flags = ir_op_flags[insn->op];
			if (((flags & IR_OP_FLAG_DATA) && insn->op != IR_VAR && (insn->op != IR_PARAM || ctx->use_lists[i].count > 0))
			 || ((flags & IR_OP_FLAG_MEM) && ctx->use_lists[i].count > 1)) {
				if (!ctx->rules || !(ctx->rules[i] & (IR_FUSED|IR_SKIPPED))) {
					vregs[i] = ++vregs_count;
				}
			}
			n = ir_insn_len(insn);
			i += n;
			insn += n;
		}
	}
	ctx->vregs_count = vregs_count;
	ctx->vregs = vregs;

	return 1;
}

int ir_assign_virtual_registers(ir_ctx *ctx)
{
	uint32_t *vregs;
	uint32_t vregs_count = 0;
	ir_ref i;
	ir_insn *insn;

	if (!ctx->rules) {
		return ir_assign_virtual_registers_slow(ctx);
	}

	/* Assign unique virtual register to each rule that needs it */
	vregs = ir_mem_malloc(ctx->insns_count * sizeof(ir_ref));

	for (i = 1, insn = &ctx->ir_base[1]; i < ctx->insns_count; i++, insn++) {
		uint32_t v = 0;

		if (ctx->rules[i] && !(ctx->rules[i] & (IR_FUSED|IR_SKIPPED))) {
			uint32_t flags = ir_op_flags[insn->op];

			if ((flags & IR_OP_FLAG_DATA)
			 || ((flags & IR_OP_FLAG_MEM) && ctx->use_lists[i].count > 1)) {
				v = ++vregs_count;
			}
		}
		vregs[i] = v;
	}

	ctx->vregs_count = vregs_count;
	ctx->vregs = vregs;

	return 1;
}

/* Lifetime intervals construction */

static ir_live_interval *ir_new_live_range(ir_ctx *ctx, int v, ir_live_pos start, ir_live_pos end)
{
	ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));

	ival->type = IR_VOID;
	ival->reg = IR_REG_NONE;
	ival->flags = 0;
	ival->vreg = v;
	ival->stack_spill_pos = -1; // not allocated
	ival->range.start = start;
	ival->range.end = ival->end = end;
	ival->range.next = NULL;
	ival->use_pos = NULL;
	ival->next = NULL;

	ctx->live_intervals[v] = ival;
	return ival;
}

static ir_live_interval *ir_add_live_range(ir_ctx *ctx, int v, ir_live_pos start, ir_live_pos end)
{
	ir_live_interval *ival = ctx->live_intervals[v];
	ir_live_range *p, *q;

	if (!ival) {
		return ir_new_live_range(ctx, v, start, end);
	}

	p = &ival->range;
	if (end >= p->start) {
		ir_live_range *prev = NULL;

		do {
			if (p->end >= start) {
				if (start < p->start) {
					p->start = start;
				}
				if (end > p->end) {
					/* merge with next */
					ir_live_range *next = p->next;

					p->end = end;
					while (next && p->end >= next->start) {
						if (next->end > p->end) {
							p->end = next->end;
						}
						p->next = next->next;
						/* remember in the "unused_ranges" list */
						next->next = ctx->unused_ranges;
						ctx->unused_ranges = next;
						next = p->next;
					}
					if (!p->next) {
						ival->end = p->end;
					}
				}
				return ival;
			}
			prev = p;
			p = prev->next;
		} while (p && end >= p->start);
		if (!p) {
			ival->end = end;
		}
		if (prev) {
			if (ctx->unused_ranges) {
				/* reuse */
				q = ctx->unused_ranges;
				ctx->unused_ranges = q->next;
			} else {
				q = ir_arena_alloc(&ctx->arena, sizeof(ir_live_range));
			}
			prev->next = q;
			q->start = start;
			q->end = end;
			q->next = p;
			return ival;
		}
	}

	if (ctx->unused_ranges) {
		/* reuse */
		q = ctx->unused_ranges;
		ctx->unused_ranges = q->next;
	} else {
		q = ir_arena_alloc(&ctx->arena, sizeof(ir_live_range));
	}
	q->start = p->start;
	q->end = p->end;
	q->next = p->next;
	p->start = start;
	p->end = end;
	p->next = q;
	return ival;
}

IR_ALWAYS_INLINE ir_live_interval *ir_add_prev_live_range(ir_ctx *ctx, int v, ir_live_pos start, ir_live_pos end)
{
	ir_live_interval *ival = ctx->live_intervals[v];

	if (ival && ival->range.start == end) {
		ival->range.start = start;
		return ival;
	}
	return ir_add_live_range(ctx, v, start, end);
}

static void ir_add_fixed_live_range(ir_ctx *ctx, ir_reg reg, ir_live_pos start, ir_live_pos end)
{
	int v = ctx->vregs_count + 1 + reg;
	ir_live_interval *ival = ctx->live_intervals[v];
	ir_live_range *q;

	if (!ival) {
		ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
		ival->type = IR_VOID;
		ival->reg = reg;
		ival->flags = IR_LIVE_INTERVAL_FIXED;
		ival->vreg = v;
		ival->stack_spill_pos = -1; // not allocated
		ival->range.start = start;
		ival->range.end = ival->end = end;
		ival->range.next = NULL;
		ival->use_pos = NULL;
		ival->next = NULL;

		ctx->live_intervals[v] = ival;
	} else if (EXPECTED(end < ival->range.start)) {
		if (ctx->unused_ranges) {
			/* reuse */
			q = ctx->unused_ranges;
			ctx->unused_ranges = q->next;
		} else {
			q = ir_arena_alloc(&ctx->arena, sizeof(ir_live_range));
		}

		q->start = ival->range.start;
		q->end = ival->range.end;
		q->next = ival->range.next;
		ival->range.start = start;
		ival->range.end = end;
		ival->range.next = q;
	} else if (end == ival->range.start) {
		ival->range.start = start;
	} else {
		ir_add_live_range(ctx, v, start, end);
	}
}

static void ir_add_tmp(ir_ctx *ctx, ir_ref ref, ir_ref tmp_ref, int32_t tmp_op_num, ir_tmp_reg tmp_reg)
{
	ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));

	ival->type = tmp_reg.type;
	ival->reg = IR_REG_NONE;
	ival->flags = IR_LIVE_INTERVAL_TEMP;
	ival->tmp_ref = tmp_ref;
	ival->tmp_op_num = tmp_op_num;
	ival->range.start = IR_START_LIVE_POS_FROM_REF(ref) + tmp_reg.start;
	ival->range.end = ival->end = IR_START_LIVE_POS_FROM_REF(ref) + tmp_reg.end;
	ival->range.next = NULL;
	ival->use_pos = NULL;

	if (!ctx->live_intervals[0]) {
		ival->next = NULL;
		ctx->live_intervals[0] = ival;
	} else if (ival->range.start >= ctx->live_intervals[0]->range.start) {
		ir_live_interval *prev = ctx->live_intervals[0];

		while (prev->next && ival->range.start >= prev->next->range.start) {
			prev = prev->next;
		}
		ival->next = prev->next;
		prev->next = ival;
	} else {
		ir_live_interval *next = ctx->live_intervals[0];

		ival->next = next;
		ctx->live_intervals[0] = ival;
	}
	return;
}

static bool ir_has_tmp(ir_ctx *ctx, ir_ref ref, int32_t op_num)
{
	ir_live_interval *ival = ctx->live_intervals[0];

	if (ival) {
		while (ival && IR_LIVE_POS_TO_REF(ival->range.start) <= ref) {
			if (ival->tmp_ref == ref && ival->tmp_op_num == op_num) {
				return 1;
			}
			ival = ival->next;
		}
	}
	return 0;
}

static ir_live_interval *ir_fix_live_range(ir_ctx *ctx, int v, ir_live_pos old_start, ir_live_pos new_start)
{
	ir_live_interval *ival = ctx->live_intervals[v];
	ir_live_range *p = &ival->range;

#if 0
	while (p && p->start < old_start) {
		p = p->next;
	}
#endif
	IR_ASSERT(ival && p->start == old_start);
	p->start = new_start;
	return ival;
}

static void ir_add_use_pos(ir_ctx *ctx, ir_live_interval *ival, ir_use_pos *use_pos)
{
	ir_use_pos *p = ival->use_pos;

	if (EXPECTED(!p || p->pos > use_pos->pos)) {
		use_pos->next = p;
		ival->use_pos = use_pos;
	} else {
		ir_use_pos *prev;

		do {
			prev = p;
			p = p->next;
		} while (p && p->pos < use_pos->pos);

		use_pos->next = prev->next;
		prev->next = use_pos;
	}
}


IR_ALWAYS_INLINE void ir_add_use(ir_ctx *ctx, ir_live_interval *ival, int op_num, ir_live_pos pos, ir_reg hint, uint8_t use_flags, ir_ref hint_ref)
{
	ir_use_pos *use_pos;

	use_pos = ir_arena_alloc(&ctx->arena, sizeof(ir_use_pos));
	use_pos->op_num = op_num;
	use_pos->hint = hint;
	use_pos->flags = use_flags;
	use_pos->hint_ref = hint_ref;
	use_pos->pos = pos;

	if (hint != IR_REG_NONE) {
		ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REGS;
	}
	if (hint_ref > 0) {
		ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REFS;
	}

	ir_add_use_pos(ctx, ival, use_pos);
}

static void ir_add_phi_use(ir_ctx *ctx, ir_live_interval *ival, int op_num, ir_live_pos pos, ir_ref phi_ref)
{
	ir_use_pos *use_pos;

	IR_ASSERT(phi_ref > 0);
	use_pos = ir_arena_alloc(&ctx->arena, sizeof(ir_use_pos));
	use_pos->op_num = op_num;
	use_pos->hint = IR_REG_NONE;
	use_pos->flags = IR_PHI_USE | IR_USE_SHOULD_BE_IN_REG; // TODO: ???
	use_pos->hint_ref = -phi_ref;
	use_pos->pos = pos;

	ir_add_use_pos(ctx, ival, use_pos);
}

static void ir_add_hint(ir_ctx *ctx, ir_ref ref, ir_live_pos pos, ir_reg hint)
{
	ir_live_interval *ival = ctx->live_intervals[ctx->vregs[ref]];

	if (!(ival->flags & IR_LIVE_INTERVAL_HAS_HINT_REGS)) {
		ir_use_pos *use_pos = ival->use_pos;

		while (use_pos) {
			if (use_pos->pos == pos) {
				if (use_pos->hint == IR_REG_NONE) {
					use_pos->hint = hint;
					ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REGS;
				}
			}
			use_pos = use_pos->next;
		}
	}
}

static void ir_hint_propagation(ir_ctx *ctx)
{
	int i;
	ir_live_interval *ival;
	ir_use_pos *use_pos;
	ir_use_pos *hint_use_pos;

	for (i = ctx->vregs_count; i > 0; i--) {
		ival = ctx->live_intervals[i];
		if (ival
		 && (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)) == (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)) {
			use_pos = ival->use_pos;
			hint_use_pos = NULL;
			while (use_pos) {
				if (use_pos->op_num == 0) {
					if (use_pos->hint_ref > 0) {
						hint_use_pos = use_pos;
					}
				} else if (use_pos->hint != IR_REG_NONE) {
					if (hint_use_pos) {
						ir_add_hint(ctx, hint_use_pos->hint_ref, hint_use_pos->pos, use_pos->hint);
						hint_use_pos = NULL;
					}
				}
				use_pos = use_pos->next;
			}
		}
	}
}

#ifdef IR_BITSET_LIVENESS
/* DFS + Loop-Forest livness for SSA using bitset(s) */
static void ir_add_osr_entry_loads(ir_ctx *ctx, ir_block *bb, ir_bitset live, uint32_t len, uint32_t b)
{
	bool ok = 1;
	int count = 0;
	ir_list *list = (ir_list*)ctx->osr_entry_loads;
	ir_ref i;

	IR_BITSET_FOREACH(live, len, i) {
		/* Skip live references from ENTRY to PARAM. TODO: duplicate PARAM in each ENTRY ??? */
		ir_use_pos *use_pos = ctx->live_intervals[i]->use_pos;
		ir_ref ref = (use_pos->hint_ref < 0) ? -use_pos->hint_ref : IR_LIVE_POS_TO_REF(use_pos->pos);

		if (use_pos->op_num) {
			ir_ref *ops = ctx->ir_base[ref].ops;
			ref = ops[use_pos->op_num];
		}

		if (ctx->ir_base[ref].op == IR_PARAM) {
			continue;
		}
		if (ctx->binding) {
			ir_ref var = ir_binding_find(ctx, ref);
			if (var < 0) {
				/* We may load the value at OSR entry-point */
				if (!count) {
					bb->flags &= ~IR_BB_EMPTY;
					bb->flags |= IR_BB_OSR_ENTRY_LOADS;
					if (!ctx->osr_entry_loads) {
						list = ctx->osr_entry_loads = ir_mem_malloc(sizeof(ir_list));
						ir_list_init(list, 16);
					}
					ir_list_push(list, b);
					ir_list_push(list, 0);
				}
				ir_list_push(list, ref);
				count++;
				continue;
			}
		}
		fprintf(stderr, "ENTRY %d (block %d start %d) - live var %d\n", ctx->ir_base[bb->start].op2, b, bb->start, ref);
		ok = 0;
	} IR_BITSET_FOREACH_END();

	if (!ok) {
		IR_ASSERT(0);
	}
	if (count) {
		ir_list_set(list, ir_list_len(ctx->osr_entry_loads) - (count + 1), count);

#if 0
		/* ENTRY "clobbers" all registers */
		ir_ref ref = ctx->ir_base[bb->start].op1;
		ir_add_fixed_live_range(ctx, IR_REG_ALL,
			IR_DEF_LIVE_POS_FROM_REF(ref),
			IR_SAVE_LIVE_POS_FROM_REF(ref));
#endif
	}
}

static void ir_add_fusion_ranges(ir_ctx *ctx, ir_ref ref, ir_ref input, ir_block *bb, ir_bitset live)
{
	ir_ref stack[4];
	int stack_pos = 0;
	ir_target_constraints constraints;
	ir_insn *insn;
	uint32_t j, n, flags, def_flags;
	ir_ref *p, child;
	uint8_t use_flags;
	ir_reg reg;
	ir_live_pos use_pos;
	ir_live_interval *ival;

	while (1) {
		IR_ASSERT(input > 0 && ctx->rules[input] & IR_FUSED);

		if (!(ctx->rules[input] & IR_SIMPLE)) {
			def_flags = ir_get_target_constraints(ctx, input, &constraints);
			n = constraints.tmps_count;
			while (n > 0) {
				n--;
				if (constraints.tmp_regs[n].type) {
					ir_add_tmp(ctx, ref, input, constraints.tmp_regs[n].num, constraints.tmp_regs[n]);
				} else {
					/* CPU specific constraints */
					ir_add_fixed_live_range(ctx, constraints.tmp_regs[n].reg,
						IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].start,
						IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].end);
				}
			}
		} else {
			def_flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			constraints.hints_count = 0;
		}

		insn = &ctx->ir_base[input];
		flags = ir_op_flags[insn->op];
		n = IR_INPUT_EDGES_COUNT(flags);
		j = 1;
		p = insn->ops + j;
		if (flags & IR_OP_FLAG_CONTROL) {
			j++;
			p++;
		}
		for (; j <= n; j++, p++) {
			IR_ASSERT(IR_OPND_KIND(flags, j) == IR_OPND_DATA);
			child = *p;
			if (child > 0) {
				uint32_t v = ctx->vregs[child];

				if (v) {
					use_flags = IR_FUSED_USE | IR_USE_FLAGS(def_flags, j);
					reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
					use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					if (EXPECTED(reg == IR_REG_NONE)) {
						use_pos += IR_USE_SUB_REF;
					}

					if (!ir_bitset_in(live, v)) {
						/* live.add(opd) */
						ir_bitset_incl(live, v);
						/* intervals[opd].addRange(b.from, op.id) */
						ival = ir_add_live_range(ctx, v,
							IR_START_LIVE_POS_FROM_REF(bb->start), use_pos);
					} else {
						ival = ctx->live_intervals[v];
					}
					ir_add_use(ctx, ival, j, use_pos, reg, use_flags, -input);
				} else if (ctx->rules[child] & IR_FUSED) {
					IR_ASSERT(stack_pos < (int)(sizeof(stack)/sizeof(stack_pos)));
					stack[stack_pos++] = child;
				} else if (ctx->rules[child] == (IR_SKIPPED|IR_RLOAD)) {
					ir_set_alocated_reg(ctx, input, j, ctx->ir_base[child].op2);
				}
			}
		}
		if (!stack_pos) {
			break;
		}
		input = stack[--stack_pos];
	}
}

int ir_compute_live_ranges(ir_ctx *ctx)
{
	uint32_t b, i, j, k, n, succ, *p;
	ir_ref ref;
	uint32_t len;
	ir_insn *insn;
	ir_block *bb, *succ_bb;
#ifdef IR_DEBUG
	ir_bitset visited;
#endif
	ir_bitset live, bb_live;
	ir_bitset loops = NULL;
	ir_bitqueue queue;
	ir_live_interval *ival;

	if (!(ctx->flags2 & IR_LINEAR) || !ctx->vregs) {
		return 0;
	}

	if (ctx->rules) {
		ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
		memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
	}

	/* Root of the list of IR_VARs */
	ctx->vars = IR_UNUSED;

	/* Compute Live Ranges */
	ctx->flags2 &= ~IR_LR_HAVE_DESSA_MOVES;
	len = ir_bitset_len(ctx->vregs_count + 1);
	bb_live = ir_mem_malloc((ctx->cfg_blocks_count + 1) * len * sizeof(ir_bitset_base_t));

	/* vregs + tmp + fixed + SRATCH + ALL */
	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));

#ifdef IR_DEBUG
	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
#endif

    if (!ctx->arena) {
		ctx->arena = ir_arena_create(16 * 1024);
	}

	/* for each basic block in reverse order */
	for (b = ctx->cfg_blocks_count; b > 0; b--) {
		bb = &ctx->cfg_blocks[b];
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		/* for each successor of b */

#ifdef IR_DEBUG
		ir_bitset_incl(visited, b);
#endif
		live = bb_live + (len * b);
		n = bb->successors_count;
		if (n == 0) {
			ir_bitset_clear(live, len);
		} else {
			p = &ctx->cfg_edges[bb->successors];
			succ = *p;

#ifdef IR_DEBUG
			/* blocks must be ordered where all dominators of a block are before this block */
			IR_ASSERT(ir_bitset_in(visited, succ) || bb->loop_header == succ);
#endif

			/* live = union of successors.liveIn */
			if (EXPECTED(succ > b) && EXPECTED(!(ctx->cfg_blocks[succ].flags & IR_BB_ENTRY))) {
				ir_bitset_copy(live, bb_live + (len * succ), len);
			} else {
				IR_ASSERT(succ > b || (ctx->cfg_blocks[succ].flags & IR_BB_LOOP_HEADER));
				ir_bitset_clear(live, len);
			}
			if (n > 1) {
				for (p++, n--; n > 0; p++, n--) {
					succ = *p;
					if (EXPECTED(succ > b) && EXPECTED(!(ctx->cfg_blocks[succ].flags & IR_BB_ENTRY))) {
						ir_bitset_union(live, bb_live + (len * succ), len);
					} else {
						IR_ASSERT(succ > b || (ctx->cfg_blocks[succ].flags & IR_BB_LOOP_HEADER));
					}
				}
			}

			/* for each opd in live */
			IR_BITSET_FOREACH(live, len, i) {
				/* intervals[opd].addRange(b.from, b.to) */
				ir_add_prev_live_range(ctx, i,
					IR_START_LIVE_POS_FROM_REF(bb->start),
					IR_END_LIVE_POS_FROM_REF(bb->end));
			} IR_BITSET_FOREACH_END();
		}

		if (bb->successors_count == 1) {
			/* for each phi function phi of successor */
			succ = ctx->cfg_edges[bb->successors];
			succ_bb = &ctx->cfg_blocks[succ];
			if (succ_bb->flags & IR_BB_HAS_PHI) {
				ir_use_list *use_list = &ctx->use_lists[succ_bb->start];

				k = ir_phi_input_number(ctx, succ_bb, b);
				IR_ASSERT(k != 0);
				for (ref = 0; ref < use_list->count; ref++) {
					ir_ref use = ctx->use_edges[use_list->refs + ref];
					insn = &ctx->ir_base[use];
					if (insn->op == IR_PHI) {
						ir_ref input = ir_insn_op(insn, k);
						if (input > 0) {
							uint32_t v = ctx->vregs[input];

							/* live.add(phi.inputOf(b)) */
							IR_ASSERT(v);
							ir_bitset_incl(live, v);
							/* intervals[phi.inputOf(b)].addRange(b.from, b.to) */
							ival = ir_add_prev_live_range(ctx, v,
								IR_START_LIVE_POS_FROM_REF(bb->start),
								IR_END_LIVE_POS_FROM_REF(bb->end));
							ir_add_phi_use(ctx, ival, k, IR_DEF_LIVE_POS_FROM_REF(bb->end), use);
						}
					}
				}
			}
		}

		/* for each operation op of b in reverse order */
		ref = bb->end;
		insn = &ctx->ir_base[ref];
		if (insn->op == IR_END || insn->op == IR_LOOP_END) {
			ref = ctx->prev_ref[ref];
		}
		for (; ref > bb->start; ref = ctx->prev_ref[ref]) {
			uint32_t def_flags;
			uint32_t flags;
			ir_ref *p;
			ir_target_constraints constraints;
			uint32_t v;

			if (ctx->rules) {
				int n;

				if (ctx->rules[ref] & (IR_FUSED|IR_SKIPPED)) {
					if (((ctx->rules[ref] & IR_RULE_MASK) == IR_VAR
					  || (ctx->rules[ref] & IR_RULE_MASK) == IR_ALLOCA)
					 && ctx->use_lists[ref].count > 0) {
						insn = &ctx->ir_base[ref];
						if (insn->op != IR_VADDR) {
							insn->op3 = ctx->vars;
							ctx->vars = ref;
						}
					}
					continue;
				}

				def_flags = ir_get_target_constraints(ctx, ref, &constraints);
				n = constraints.tmps_count;
				while (n > 0) {
					n--;
					if (constraints.tmp_regs[n].type) {
						ir_add_tmp(ctx, ref, ref, constraints.tmp_regs[n].num, constraints.tmp_regs[n]);
					} else {
						/* CPU specific constraints */
						ir_add_fixed_live_range(ctx, constraints.tmp_regs[n].reg,
							IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].start,
							IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].end);
					}
				}
			} else {
				def_flags = 0;
				constraints.def_reg = IR_REG_NONE;
				constraints.hints_count = 0;
			}

			insn = &ctx->ir_base[ref];
			v = ctx->vregs[ref];
			if (v) {
				IR_ASSERT(ir_bitset_in(live, v));

				if (insn->op != IR_PHI) {
					ir_live_pos def_pos;
					ir_ref hint_ref = 0;
					ir_reg reg = constraints.def_reg;

					if (reg != IR_REG_NONE) {
						def_pos = IR_SAVE_LIVE_POS_FROM_REF(ref);
						if (insn->op == IR_PARAM || insn->op == IR_RLOAD) {
							/* parameter register must be kept before it's copied */
							ir_add_fixed_live_range(ctx, reg, IR_START_LIVE_POS_FROM_REF(bb->start), def_pos);
						}
					} else if (def_flags & IR_DEF_REUSES_OP1_REG) {
						if (!IR_IS_CONST_REF(insn->op1) && ctx->vregs[insn->op1]) {
							hint_ref = insn->op1;
						}
						def_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					} else if (def_flags & IR_DEF_CONFLICTS_WITH_INPUT_REGS) {
						def_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					} else {
						if (insn->op == IR_PARAM) {
							/* We may reuse parameter stack slot for spilling */
							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
						} else if (insn->op == IR_VLOAD) {
							/* Load may be fused into the usage instruction */
							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_LOAD;
						}
						def_pos = IR_DEF_LIVE_POS_FROM_REF(ref);
					}
					/* live.remove(opd) */
					ir_bitset_excl(live, v);
					/* intervals[opd].setFrom(op.id) */
					ival = ir_fix_live_range(ctx, v,
						IR_START_LIVE_POS_FROM_REF(bb->start), def_pos);
					ival->type = insn->type;
					ir_add_use(ctx, ival, 0, def_pos, reg, def_flags, hint_ref);
				} else {
					/* live.remove(opd) */
					ir_bitset_excl(live, v);
					/* PHIs inputs must not be processed */
					ival = ctx->live_intervals[v];
					if (UNEXPECTED(!ival)) {
						/* Dead PHI */
						ival = ir_add_live_range(ctx, v, IR_DEF_LIVE_POS_FROM_REF(ref), IR_USE_LIVE_POS_FROM_REF(ref));
					}
					ival->type = insn->type;
					ir_add_use(ctx, ival, 0, IR_DEF_LIVE_POS_FROM_REF(ref), IR_REG_NONE, IR_USE_SHOULD_BE_IN_REG, 0);
					continue;
				}
			}

			IR_ASSERT(insn->op != IR_PHI && (!ctx->rules || !(ctx->rules[ref] & (IR_FUSED|IR_SKIPPED))));
			flags = ir_op_flags[insn->op];
			j = 1;
			p = insn->ops + 1;
			if (flags & (IR_OP_FLAG_CONTROL|IR_OP_FLAG_MEM|IR_OP_FLAG_PINNED)) {
				j++;
				p++;
			}
			for (; j <= insn->inputs_count; j++, p++) {
				ir_ref input = *p;
				ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
				ir_live_pos use_pos;
				ir_ref hint_ref = 0;
				uint32_t v;

				if (input > 0) {
					v = ctx->vregs[input];
					if (v) {
						use_pos = IR_USE_LIVE_POS_FROM_REF(ref);
						if (reg != IR_REG_NONE) {
							use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
							ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
						} else if (def_flags & IR_DEF_REUSES_OP1_REG) {
							if (j == 1) {
								use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
								IR_ASSERT(ctx->vregs[ref]);
								hint_ref = ref;
							} else if (input == insn->op1) {
								/* Input is the same as "op1" */
								use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
							}
						}
						if (!ir_bitset_in(live, v)) {
							/* live.add(opd) */
							ir_bitset_incl(live, v);
							/* intervals[opd].addRange(b.from, op.id) */
							ival = ir_add_live_range(ctx, v, IR_START_LIVE_POS_FROM_REF(bb->start), use_pos);
						} else {
							ival = ctx->live_intervals[v];
						}
						ir_add_use(ctx, ival, j, use_pos, reg, IR_USE_FLAGS(def_flags, j), hint_ref);
					} else if (ctx->rules) {
						if (ctx->rules[input] & IR_FUSED) {
						    ir_add_fusion_ranges(ctx, ref, input, bb, live);
						} else if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
							ir_set_alocated_reg(ctx, ref, j, ctx->ir_base[input].op2);
						}
					}
				} else if (reg != IR_REG_NONE) {
					use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
				}
			}
		}

		/* if b is loop header */
		if ((bb->flags & IR_BB_LOOP_HEADER)
		 && !ir_bitset_empty(live, len)) {
			/* variables live at loop header are alive at the whole loop body */
			uint32_t bb_set_len = ir_bitset_len(ctx->cfg_blocks_count + 1);
			uint32_t child;
			ir_block *child_bb;
			ir_bitset child_live_in;

			if (!loops) {
				loops = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
				ir_bitqueue_init(&queue, ctx->cfg_blocks_count + 1);
			} else {
				ir_bitset_clear(loops, bb_set_len);
				ir_bitqueue_clear(&queue);
			}
			ir_bitset_incl(loops, b);
			child = b;
			do {
				child_bb = &ctx->cfg_blocks[child];
				child_live_in = bb_live + (len * child);

				IR_BITSET_FOREACH(live, len, i) {
					ir_bitset_incl(child_live_in, i);
					ir_add_live_range(ctx, i,
						IR_START_LIVE_POS_FROM_REF(child_bb->start),
						IR_END_LIVE_POS_FROM_REF(child_bb->end));
				} IR_BITSET_FOREACH_END();

				child = child_bb->dom_child;
				while (child) {
					child_bb = &ctx->cfg_blocks[child];
					if (child_bb->loop_header && ir_bitset_in(loops, child_bb->loop_header)) {
						ir_bitqueue_add(&queue, child);
						if (child_bb->flags & IR_BB_LOOP_HEADER) {
							ir_bitset_incl(loops, child);
						}
					}
					child = child_bb->dom_next_child;
				}
			} while ((child = ir_bitqueue_pop(&queue)) != (uint32_t)-1);
		}
	}

	if (ctx->entries) {
		for (i = 0; i < ctx->entries_count; i++) {
			b = ctx->entries[i];
			bb = &ctx->cfg_blocks[b];
			live = bb_live + (len * b);
			ir_add_osr_entry_loads(ctx, bb, live, len, b);
		}
		if (ctx->osr_entry_loads) {
			ir_list_push((ir_list*)ctx->osr_entry_loads, 0);
		}
	}

	if (loops) {
		ir_mem_free(loops);
		ir_bitqueue_free(&queue);
	}

	ir_mem_free(bb_live);
#ifdef IR_DEBUG
	ir_mem_free(visited);
#endif

	return 1;
}

#else
/* Path exploration by definition liveness for SSA using sets represented by linked lists */

#define IS_LIVE_IN_BLOCK(v, b) \
	(live_in_block[v] == b)
#define SET_LIVE_IN_BLOCK(v, b) do { \
		live_in_block[v] = b; \
	} while (0)

/* Returns the last virtual register alive at the end of the block (it is used as an already-visited marker) */
IR_ALWAYS_INLINE uint32_t ir_live_out_top(ir_ctx *ctx, uint32_t *live_outs, ir_list *live_lists, uint32_t b)
{
#if 0
	return live_outs[b];
#else
	if (!live_outs[b]) {
		return -1;
	}
	return ir_list_at(live_lists, live_outs[b]);
#endif
}

/* Remember a virtual register alive at the end of the block */
IR_ALWAYS_INLINE void ir_live_out_push(ir_ctx *ctx, uint32_t *live_outs, ir_list *live_lists, uint32_t b, uint32_t v)
{
#if 0
	ir_block *bb = &ctx->cfg_blocks[b];
	live_outs[b] = v;
	ir_add_prev_live_range(ctx, v,
		IR_START_LIVE_POS_FROM_REF(bb->start),
		IR_END_LIVE_POS_FROM_REF(bb->end));
#else
	if (live_lists->len >= live_lists->a.size) {
		ir_array_grow(&live_lists->a, live_lists->a.size + 1024);
	}
	/* Form a linked list of virtual register live at the end of the block */
	ir_list_push_unchecked(live_lists, live_outs[b]); /* push old root of the list (previous element of the list) */
	live_outs[b] = ir_list_len(live_lists);           /* remember the new root */
	ir_list_push_unchecked(live_lists, v);            /* push a virtual register */
#endif
}

/*
 * Computes live-out sets for each basic-block per variable using def-use chains.
 *
 * The implementation is based on algorithms 6 and 7 desriebed in
 * "Computing Liveness Sets for SSA-Form Programs", Florian Brandner, Benoit Boissinot.
 * Alain Darte, Benoit Dupont de Dinechin, Fabrice Rastello. TR Inria RR-7503, 2011
 */
static void ir_compute_live_sets(ir_ctx *ctx, uint32_t *live_outs, ir_list *live_lists)
{
	ir_list block_queue, fuse_queue;
	ir_ref i;

	ir_list_init(&fuse_queue, 16);
	ir_list_init(&block_queue, 256);

	/* For each virtual register explore paths from all uses to definition */
	for (i = ctx->insns_count - 1; i > 0; i--) {
		uint32_t v = ctx->vregs[i];

		if (v) {
			uint32_t def_block = ctx->cfg_map[i];
			ir_use_list *use_list = &ctx->use_lists[i];
			ir_ref *p, n = use_list->count;

			/* Collect all blocks where 'v' is used into a 'block_queue' */
			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
				ir_ref use = *p;
				ir_insn *insn = &ctx->ir_base[use];

				if (UNEXPECTED(insn->op == IR_PHI)) {
					ir_ref n = insn->inputs_count - 1;
					ir_ref *p = insn->ops + 2; /* PHI data inputs */
					ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */

					for (;n > 0; p++, q++, n--) {
						if (*p == i) {
							uint32_t pred_block = ctx->cfg_map[*q];

							if (ir_live_out_top(ctx, live_outs, live_lists, pred_block) != v) {
								ir_live_out_push(ctx, live_outs, live_lists, pred_block, v);
								if (pred_block != def_block) {
									ir_list_push(&block_queue, pred_block);
								}
							}
						}
					}
				} else if (ctx->rules && UNEXPECTED(ctx->rules[use] & IR_FUSED)) {
					while (1) {
						ir_use_list *use_list = &ctx->use_lists[use];
						ir_ref *p, n = use_list->count;

						for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
							ir_ref use = *p;

							if (ctx->rules[use] & IR_FUSED) {
								ir_list_push(&fuse_queue, use);
							} else {
								uint32_t use_block = ctx->cfg_map[use];

								if (def_block != use_block && ir_live_out_top(ctx, live_outs, live_lists, use_block) != v) {
									ir_list_push(&block_queue, use_block);
								}
							}
						}
						if (!ir_list_len(&fuse_queue)) {
							break;
						}
						use = ir_list_pop(&fuse_queue);
					}
				} else {
					uint32_t use_block = ctx->cfg_map[use];

					/* Check if the virtual register is alive at the start of 'use_block' */
					if (def_block != use_block && ir_live_out_top(ctx, live_outs, live_lists, use_block) != v) {
						ir_list_push(&block_queue, use_block);
					}
				}
			}

			/* UP_AND_MARK: Traverse through predecessor blocks until we reach the block where 'v' is defined*/
			while (ir_list_len(&block_queue)) {
				uint32_t b = ir_list_pop(&block_queue);
				ir_block *bb = &ctx->cfg_blocks[b];
				uint32_t *p, n = bb->predecessors_count;

				if (bb->flags & IR_BB_ENTRY) {
					/* live_in_push(ENTRY, v) */
					ir_insn *insn = &ctx->ir_base[bb->start];

					IR_ASSERT(insn->op == IR_ENTRY);
					IR_ASSERT(insn->op3 >= 0 && insn->op3 < (ir_ref)ctx->entries_count);
					if (live_lists->len >= live_lists->a.size) {
						ir_array_grow(&live_lists->a, live_lists->a.size + 1024);
					}
					ir_list_push_unchecked(live_lists, live_outs[ctx->cfg_blocks_count + 1 + insn->op3]);
					ir_list_push_unchecked(live_lists, v);
					live_outs[ctx->cfg_blocks_count + 1 + insn->op3] = ir_list_len(live_lists) - 1;
					continue;
				}
				for (p = &ctx->cfg_edges[bb->predecessors]; n > 0; p++, n--) {
					uint32_t pred_block = *p;

					/* Check if 'pred_block' wasn't traversed before */
					if (ir_live_out_top(ctx, live_outs, live_lists, pred_block) != v) {
						/* Mark a virtual register 'v' alive at the end of 'pred_block' */
						ir_live_out_push(ctx, live_outs, live_lists, pred_block, v);
						if (pred_block != def_block) {
							ir_list_push(&block_queue, pred_block);
						}
					}
				}
			}
		}
	}

	ir_list_free(&block_queue);
	ir_list_free(&fuse_queue);
}

static void ir_add_osr_entry_loads(ir_ctx *ctx, ir_block *bb, uint32_t pos, ir_list *live_lists, uint32_t b)
{
	bool ok = 1;
	int count = 0;
	ir_list *list = (ir_list*)ctx->osr_entry_loads;
	ir_ref i;

	while (pos) {
		i = ir_list_at(live_lists, pos);
		pos = ir_list_at(live_lists, pos - 1);

		/* Skip live references from ENTRY to PARAM. TODO: duplicate PARAM in each ENTRY ??? */
		ir_use_pos *use_pos = ctx->live_intervals[i]->use_pos;
		ir_ref ref = (use_pos->hint_ref < 0) ? -use_pos->hint_ref : IR_LIVE_POS_TO_REF(use_pos->pos);

		if (use_pos->op_num) {
			ir_ref *ops = ctx->ir_base[ref].ops;
			ref = ops[use_pos->op_num];
		}

		if (ctx->ir_base[ref].op == IR_PARAM) {
			continue;
		}
		if (ctx->binding) {
			ir_ref var = ir_binding_find(ctx, ref);
			if (var < 0) {
				/* We may load the value at OSR entry-point */
				if (!count) {
					bb->flags &= ~IR_BB_EMPTY;
					bb->flags |= IR_BB_OSR_ENTRY_LOADS;
					if (!ctx->osr_entry_loads) {
						list = ctx->osr_entry_loads = ir_mem_malloc(sizeof(ir_list));
						ir_list_init(list, 16);
					}
					ir_list_push(list, b);
					ir_list_push(list, 0);
				}
				ir_list_push(list, ref);
				count++;
				continue;
			}
		}
		fprintf(stderr, "ENTRY %d (block %d start %d) - live var %d\n", ctx->ir_base[bb->start].op2, b, bb->start, ref);
		ok = 0;
	}

	if (!ok) {
		IR_ASSERT(0);
	}
	if (count) {
		ir_list_set(list, ir_list_len(ctx->osr_entry_loads) - (count + 1), count);

#if 0
		/* ENTRY "clobbers" all registers */
		ir_ref ref = ctx->ir_base[bb->start].op1;
		ir_add_fixed_live_range(ctx, IR_REG_ALL,
			IR_DEF_LIVE_POS_FROM_REF(ref),
			IR_SAVE_LIVE_POS_FROM_REF(ref));
#endif
	}
}

static void ir_add_fusion_ranges(ir_ctx *ctx, ir_ref ref, ir_ref input, ir_block *bb, uint32_t *live_in_block, uint32_t b)
{
	ir_ref stack[4];
	int stack_pos = 0;
	ir_target_constraints constraints;
	ir_insn *insn;
	uint32_t j, n, flags, def_flags;
	ir_ref *p, child;
	uint8_t use_flags;
	ir_reg reg;
	ir_live_pos pos = IR_START_LIVE_POS_FROM_REF(ref);
	ir_live_pos use_pos;
	ir_live_interval *ival;

	while (1) {
		IR_ASSERT(input > 0 && ctx->rules[input] & IR_FUSED);

		if (!(ctx->rules[input] & IR_SIMPLE)) {
			def_flags = ir_get_target_constraints(ctx, input, &constraints);
			n = constraints.tmps_count;
			while (n > 0) {
				n--;
				if (constraints.tmp_regs[n].type) {
					ir_add_tmp(ctx, ref, input, constraints.tmp_regs[n].num, constraints.tmp_regs[n]);
				} else {
					/* CPU specific constraints */
					ir_add_fixed_live_range(ctx, constraints.tmp_regs[n].reg,
						pos + constraints.tmp_regs[n].start,
						pos + constraints.tmp_regs[n].end);
				}
			}
		} else {
			def_flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			constraints.hints_count = 0;
		}

		insn = &ctx->ir_base[input];
		flags = ir_op_flags[insn->op];
		IR_ASSERT(!IR_OP_HAS_VAR_INPUTS(flags));
		n = IR_INPUT_EDGES_COUNT(flags);
		j = 1;
		p = insn->ops + j;
		if (flags & (IR_OP_FLAG_CONTROL|IR_OP_FLAG_PINNED)) {
			j++;
			p++;
		}
		for (; j <= n; j++, p++) {
			IR_ASSERT(IR_OPND_KIND(flags, j) == IR_OPND_DATA);
			child = *p;
			if (child > 0) {
				uint32_t v = ctx->vregs[child];

				if (v) {
					reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
					use_pos = pos;
					if (EXPECTED(reg == IR_REG_NONE)) {
						use_pos += IR_USE_SUB_REF;
					}

					if (!IS_LIVE_IN_BLOCK(v, b)) {
						/* live.add(opd) */
						SET_LIVE_IN_BLOCK(v, b);
						/* intervals[opd].addRange(b.from, op.id) */
						ival = ir_add_live_range(ctx, v,
							IR_START_LIVE_POS_FROM_REF(bb->start), use_pos);
					} else {
						ival = ctx->live_intervals[v];
					}
					use_flags = IR_FUSED_USE | IR_USE_FLAGS(def_flags, j);
					ir_add_use(ctx, ival, j, use_pos, reg, use_flags, -input);
				} else if (ctx->rules[child] & IR_FUSED) {
					IR_ASSERT(stack_pos < (int)(sizeof(stack)/sizeof(stack_pos)));
					stack[stack_pos++] = child;
				} else if (ctx->rules[child] == (IR_SKIPPED|IR_RLOAD)) {
					ir_set_alocated_reg(ctx, input, j, ctx->ir_base[child].op2);
				}
			}
		}
		if (!stack_pos) {
			break;
		}
		input = stack[--stack_pos];
	}
}

int ir_compute_live_ranges(ir_ctx *ctx)
{
	uint32_t b, i, j, k, n, succ;
	ir_ref ref;
	ir_insn *insn;
	ir_block *bb, *succ_bb;
	uint32_t *live_outs;
	uint32_t *live_in_block;
	ir_list live_lists;
	ir_live_interval *ival;

	if (!(ctx->flags2 & IR_LINEAR) || !ctx->vregs) {
		return 0;
	}

	if (ctx->rules) {
		ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
		memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
	}

	/* Root of the list of IR_VARs */
	ctx->vars = IR_UNUSED;

	/* Compute Live Ranges */
	ctx->flags2 &= ~IR_LR_HAVE_DESSA_MOVES;

	/* vregs + tmp + fixed + SRATCH + ALL */
	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));

    if (!ctx->arena) {
		ctx->arena = ir_arena_create(16 * 1024);
	}

	live_outs = ir_mem_calloc(ctx->cfg_blocks_count + 1 + ctx->entries_count, sizeof(uint32_t));
	ir_list_init(&live_lists, 1024);
	ir_compute_live_sets(ctx, live_outs, &live_lists);
	live_in_block = ir_mem_calloc(ctx->vregs_count + 1, sizeof(uint32_t));

	/* for each basic block in reverse order */
	for (b = ctx->cfg_blocks_count; b > 0; b--) {
		bb = &ctx->cfg_blocks[b];
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));

		/* For all virtual register alive at the end of the block */
		n = live_outs[b];
		while (n != 0) {
			i = ir_list_at(&live_lists, n);
			SET_LIVE_IN_BLOCK(i, b);
			ir_add_prev_live_range(ctx, i,
				IR_START_LIVE_POS_FROM_REF(bb->start),
				IR_END_LIVE_POS_FROM_REF(bb->end));
			n = ir_list_at(&live_lists, n - 1);
		}

		if (bb->successors_count == 1) {
			/* for each phi function of the successor */
			succ = ctx->cfg_edges[bb->successors];
			succ_bb = &ctx->cfg_blocks[succ];
			if (succ_bb->flags & IR_BB_HAS_PHI) {
				ir_use_list *use_list = &ctx->use_lists[succ_bb->start];
				ir_ref n, *p;

				k = ir_phi_input_number(ctx, succ_bb, b);
				IR_ASSERT(k != 0);
				n = use_list->count;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					ir_ref use = *p;
					insn = &ctx->ir_base[use];
					if (insn->op == IR_PHI) {
						ir_ref input = ir_insn_op(insn, k);
						if (input > 0) {
							uint32_t v = ctx->vregs[input];

							if (v) {
								ival = ctx->live_intervals[v];
								ir_add_phi_use(ctx, ival, k, IR_DEF_LIVE_POS_FROM_REF(bb->end), use);
							}
						}
					}
				}
			}
		}

		/* for each operation of the block in reverse order */
		ref = bb->end;
		insn = &ctx->ir_base[ref];
		if (insn->op == IR_END || insn->op == IR_LOOP_END) {
			ref = ctx->prev_ref[ref];
		}
		for (; ref > bb->start; ref = ctx->prev_ref[ref]) {
			uint32_t def_flags;
			uint32_t flags;
			ir_ref *p;
			ir_target_constraints constraints;
			uint32_t v;

			if (ctx->rules) {
				int n;

				if (ctx->rules[ref] & (IR_FUSED|IR_SKIPPED)) {
					if (((ctx->rules[ref] & IR_RULE_MASK) == IR_VAR
					  || (ctx->rules[ref] & IR_RULE_MASK) == IR_ALLOCA)
					 && ctx->use_lists[ref].count > 0) {
						insn = &ctx->ir_base[ref];
						if (insn->op != IR_VADDR && insn->op != IR_PARAM) {
							insn->op3 = ctx->vars;
							ctx->vars = ref;
						}
					}
					continue;
				}

				def_flags = ir_get_target_constraints(ctx, ref, &constraints);
				n = constraints.tmps_count;
				while (n > 0) {
					n--;
					if (constraints.tmp_regs[n].type) {
						ir_add_tmp(ctx, ref, ref, constraints.tmp_regs[n].num, constraints.tmp_regs[n]);
					} else {
						/* CPU specific constraints */
						ir_add_fixed_live_range(ctx, constraints.tmp_regs[n].reg,
							IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].start,
							IR_START_LIVE_POS_FROM_REF(ref) + constraints.tmp_regs[n].end);
					}
				}
			} else {
				def_flags = 0;
				constraints.def_reg = IR_REG_NONE;
				constraints.hints_count = 0;
			}

			insn = &ctx->ir_base[ref];
			v = ctx->vregs[ref];
			if (v) {
				if (insn->op != IR_PHI) {
					ir_live_pos def_pos;
					ir_ref hint_ref = 0;
					ir_reg reg = constraints.def_reg;

					if (reg != IR_REG_NONE) {
						def_pos = IR_SAVE_LIVE_POS_FROM_REF(ref);
						if (insn->op == IR_PARAM || insn->op == IR_RLOAD) {
							/* parameter register must be kept before it's copied */
							ir_add_fixed_live_range(ctx, reg, IR_START_LIVE_POS_FROM_REF(bb->start), def_pos);
						}
					} else if (def_flags & IR_DEF_REUSES_OP1_REG) {
						if (!IR_IS_CONST_REF(insn->op1) && ctx->vregs[insn->op1]) {
							hint_ref = insn->op1;
						}
						if (def_flags & IR_DEF_CONFLICTS_WITH_INPUT_REGS) {
							def_pos = IR_USE_LIVE_POS_FROM_REF(ref);
						} else {
							def_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
						}
					} else if (def_flags & IR_DEF_CONFLICTS_WITH_INPUT_REGS) {
						def_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					} else {
						if (insn->op == IR_PARAM) {
							/* We may reuse parameter stack slot for spilling */
							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
						} else if (insn->op == IR_VLOAD) {
							/* Load may be fused into the usage instruction */
							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_LOAD;
						}
						def_pos = IR_DEF_LIVE_POS_FROM_REF(ref);
					}
					/* intervals[opd].setFrom(op.id) */
					ival = ir_fix_live_range(ctx, v,
						IR_START_LIVE_POS_FROM_REF(bb->start), def_pos);
					ival->type = insn->type;
					ir_add_use(ctx, ival, 0, def_pos, reg, def_flags, hint_ref);
				} else {
					/* PHIs inputs must not be processed */
					ival = ctx->live_intervals[v];
					if (UNEXPECTED(!ival)) {
						/* Dead PHI */
						ival = ir_add_live_range(ctx, v, IR_DEF_LIVE_POS_FROM_REF(ref), IR_USE_LIVE_POS_FROM_REF(ref));
					}
					ival->type = insn->type;
					ir_add_use(ctx, ival, 0, IR_DEF_LIVE_POS_FROM_REF(ref), IR_REG_NONE, IR_USE_SHOULD_BE_IN_REG, 0);
					continue;
				}
			}

			IR_ASSERT(insn->op != IR_PHI && (!ctx->rules || !(ctx->rules[ref] & (IR_FUSED|IR_SKIPPED))));
			flags = ir_op_flags[insn->op];
			j = 1;
			p = insn->ops + 1;
			if (flags & (IR_OP_FLAG_CONTROL|IR_OP_FLAG_MEM|IR_OP_FLAG_PINNED)) {
				j++;
				p++;
			}
			for (; j <= insn->inputs_count; j++, p++) {
				ir_ref input = *p;
				ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
				ir_live_pos use_pos;
				ir_ref hint_ref = 0;
				uint32_t v;

				if (input > 0) {
					v = ctx->vregs[input];
					if (v) {
						use_pos = IR_USE_LIVE_POS_FROM_REF(ref);
						if (reg != IR_REG_NONE) {
							use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
							ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
						} else if (def_flags & IR_DEF_REUSES_OP1_REG) {
							if (j == 1) {
								if (def_flags & IR_DEF_CONFLICTS_WITH_INPUT_REGS) {
									use_pos = IR_USE_LIVE_POS_FROM_REF(ref);
								} else {
									use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
								}
								IR_ASSERT(ctx->vregs[ref]);
								hint_ref = ref;
							} else if (input == insn->op1) {
								/* Input is the same as "op1" */
								use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
							}
						}
						if (!IS_LIVE_IN_BLOCK(v, b)) {
							/* live.add(opd) */
							SET_LIVE_IN_BLOCK(v, b);
							/* intervals[opd].addRange(b.from, op.id) */
							ival = ir_add_live_range(ctx, v, IR_START_LIVE_POS_FROM_REF(bb->start), use_pos);
						} else {
							ival = ctx->live_intervals[v];
						}
						ir_add_use(ctx, ival, j, use_pos, reg, IR_USE_FLAGS(def_flags, j), hint_ref);
					} else if (ctx->rules) {
						if (ctx->rules[input] & IR_FUSED) {
						    ir_add_fusion_ranges(ctx, ref, input, bb, live_in_block, b);
						} else {
							if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
								ir_set_alocated_reg(ctx, ref, j, ctx->ir_base[input].op2);
							}
							if (reg != IR_REG_NONE) {
								use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
								ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
							}
						}
					}
				} else if (reg != IR_REG_NONE) {
					use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
					ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
				}
			}
		}
	}

	if (ctx->entries) {
		for (i = 0; i < ctx->entries_count; i++) {
			b = ctx->entries[i];
			bb = &ctx->cfg_blocks[b];
			IR_ASSERT(bb->predecessors_count == 1);
			ir_add_osr_entry_loads(ctx, bb, live_outs[ctx->cfg_blocks_count + 1 + i], &live_lists, b);
		}
		if (ctx->osr_entry_loads) {
			ir_list_push((ir_list*)ctx->osr_entry_loads, 0);
		}
	}

	ir_list_free(&live_lists);
	ir_mem_free(live_outs);
	ir_mem_free(live_in_block);

	return 1;
}

#endif

/* Live Ranges coalescing */

static ir_live_pos ir_ivals_overlap(ir_live_range *lrg1, ir_live_range *lrg2)
{
	while (1) {
		if (lrg2->start < lrg1->end) {
			if (lrg1->start < lrg2->end) {
				return IR_MAX(lrg1->start, lrg2->start);
			} else {
				lrg2 = lrg2->next;
				if (!lrg2) {
					return 0;
				}
			}
		} else {
			lrg1 = lrg1->next;
			if (!lrg1) {
				return 0;
			}
		}
	}
}

static ir_live_pos ir_vregs_overlap(ir_ctx *ctx, uint32_t r1, uint32_t r2)
{
	ir_live_interval *ival1 = ctx->live_intervals[r1];
	ir_live_interval *ival2 = ctx->live_intervals[r2];

#if 0
	if (ival2->range.start >= ival1->end
	 || ival1->range.start >= ival2->end) {
		return 0;
	}
#endif
	return ir_ivals_overlap(&ival1->range, &ival2->range);
}

static bool ir_ivals_inside(ir_live_range *parent, ir_live_range *child)
{
	do {
		while (parent && parent->end < child->start) {
			parent = parent->next;
		}
		if (!parent || parent->start > child->start || parent->end < child->end) {
			return 0;
		}
		child = child->next;
	} while (child);
	return 1;
}

static bool ir_vregs_inside(ir_ctx *ctx, uint32_t parent, uint32_t child)
{
	ir_live_interval *child_ival = ctx->live_intervals[child];
	ir_live_interval *parent_ival = ctx->live_intervals[parent];

	if ((child_ival->flags | parent_ival->flags) & IR_LIVE_INTERVAL_COALESCED) {
		// TODO: Support valid cases with already coalesced "parent_ival
		return 0;
	}
#if 0
	if (child_ival->end >= parent_ival->end) {
		return 0;
	}
#endif
	return ir_ivals_inside(&parent_ival->range, &child_ival->range);
}

static void ir_vregs_join(ir_ctx *ctx, uint32_t r1, uint32_t r2)
{
	ir_live_interval *ival = ctx->live_intervals[r2];
	ir_live_range *live_range = &ival->range;
	ir_live_range *next;
	ir_use_pos *use_pos, *next_pos, **prev;

#if 0
	fprintf(stderr, "COALESCE %d -> %d\n", r2, r1);
#endif

	ir_add_live_range(ctx, r1, live_range->start, live_range->end);
	live_range = live_range->next;
	while (live_range) {
		next = live_range->next;
		live_range->next = ctx->unused_ranges;
		ctx->unused_ranges = live_range;
		ir_add_live_range(ctx, r1, live_range->start, live_range->end);
		live_range = next;
	}

	/* merge sorted use_pos lists */
	prev = &ctx->live_intervals[r1]->use_pos;
	use_pos = ival->use_pos;
	while (use_pos) {
		if (use_pos->hint_ref > 0 && ctx->vregs[use_pos->hint_ref] == r1) {
			use_pos->hint_ref = 0;
		}
		while (*prev && ((*prev)->pos < use_pos->pos ||
			((*prev)->pos == use_pos->pos &&
				(use_pos->op_num == 0 || (*prev)->op_num < use_pos->op_num)))) {
			if ((*prev)->hint_ref > 0 && ctx->vregs[(*prev)->hint_ref] == r2) {
				(*prev)->hint_ref = 0;
			}
			prev = &(*prev)->next;
		}
		next_pos = use_pos->next;
		use_pos->next = *prev;
		*prev = use_pos;
		prev = &use_pos->next;
	    use_pos = next_pos;
	}
	use_pos = *prev;
	while (use_pos) {
		if (use_pos->hint_ref > 0 && ctx->vregs[use_pos->hint_ref] == r2) {
			use_pos->hint_ref = 0;
		}
		use_pos = use_pos->next;
	}

	ctx->live_intervals[r1]->flags |=
		IR_LIVE_INTERVAL_COALESCED | (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS));
	if (ctx->ir_base[IR_LIVE_POS_TO_REF(ctx->live_intervals[r1]->use_pos->pos)].op != IR_VLOAD) {
		ctx->live_intervals[r1]->flags &= ~IR_LIVE_INTERVAL_MEM_LOAD;
	}
	if (ival->flags & IR_LIVE_INTERVAL_MEM_PARAM) {
		IR_ASSERT(!(ctx->live_intervals[r1]->flags & IR_LIVE_INTERVAL_MEM_PARAM));
		ctx->live_intervals[r1]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
	}
	ctx->live_intervals[r2] = NULL;

	// TODO: remember to reuse ???
	//ir_mem_free(ival);
}

static void ir_vregs_coalesce(ir_ctx *ctx, uint32_t v1, uint32_t v2, ir_ref from, ir_ref to)
{
	ir_ref i;
	uint16_t f1 = ctx->live_intervals[v1]->flags;
	uint16_t f2 = ctx->live_intervals[v2]->flags;

#if 0
	if (ctx->binding) {
		ir_ref b1 = ir_binding_find(ctx, from);
		ir_ref b2 = ir_binding_find(ctx, to);
		IR_ASSERT(b1 == b2);
	}
#endif
	if ((f1 & IR_LIVE_INTERVAL_COALESCED) && !(f2 & IR_LIVE_INTERVAL_COALESCED)) {
		ir_vregs_join(ctx, v1, v2);
		ctx->vregs[to] = v1;
	} else if ((f2 & IR_LIVE_INTERVAL_COALESCED) && !(f1 & IR_LIVE_INTERVAL_COALESCED)) {
		ir_vregs_join(ctx, v2, v1);
		ctx->vregs[from] = v2;
	} else if (from < to) {
		ir_vregs_join(ctx, v1, v2);
		if (f2 & IR_LIVE_INTERVAL_COALESCED) {
			for (i = 1; i < ctx->insns_count; i++) {
				if (ctx->vregs[i] == v2) {
					ctx->vregs[i] = v1;
				}
			}
		} else {
			ctx->vregs[to] = v1;
		}
	} else {
		ir_vregs_join(ctx, v2, v1);
		if (f1 & IR_LIVE_INTERVAL_COALESCED) {
			for (i = 1; i < ctx->insns_count; i++) {
				if (ctx->vregs[i] == v1) {
					ctx->vregs[i] = v2;
				}
			}
		} else {
			ctx->vregs[from] = v2;
		}
	}
}

static void ir_add_phi_move(ir_ctx *ctx, uint32_t b, ir_ref from, ir_ref to)
{
	if (IR_IS_CONST_REF(from) || ctx->vregs[from] != ctx->vregs[to]) {
		ctx->cfg_blocks[b].flags &= ~IR_BB_EMPTY;
		ctx->cfg_blocks[b].flags |= IR_BB_DESSA_MOVES;
		ctx->flags2 |= IR_LR_HAVE_DESSA_MOVES;
#if 0
		fprintf(stderr, "BB%d: MOV %d -> %d\n", b, from, to);
#endif
	}
}

typedef struct _ir_coalesce_block {
	uint32_t b;
	uint32_t loop_depth;
} ir_coalesce_block;

static int ir_block_cmp(const void *b1, const void *b2)
{
	ir_coalesce_block *d1 = (ir_coalesce_block*)b1;
	ir_coalesce_block *d2 = (ir_coalesce_block*)b2;

	if (d1->loop_depth > d2->loop_depth) {
		return -1;
	} else if (d1->loop_depth == d2->loop_depth) {
		if (d1->b < d2->b) {
			return -1;
		} else {
			return 1;
		}
	} else {
		return 1;
	}
}

static void ir_swap_operands(ir_ctx *ctx, ir_ref i, ir_insn *insn)
{
	ir_live_pos pos = IR_USE_LIVE_POS_FROM_REF(i);
	ir_live_pos load_pos = IR_LOAD_LIVE_POS_FROM_REF(i);
	ir_live_interval *ival;
	ir_live_range *r;
	ir_use_pos *p, *p1 = NULL, *p2 = NULL;
	ir_ref tmp;

	tmp = insn->op1;
	insn->op1 = insn->op2;
	insn->op2 = tmp;

	ival = ctx->live_intervals[ctx->vregs[insn->op1]];
	p = ival->use_pos;
	while (p) {
		if (p->pos == pos) {
			p->pos = load_pos;
			p->op_num = 1;
			p1 = p;
			break;
		}
		p = p->next;
	}

	ival = ctx->live_intervals[ctx->vregs[i]];
	p = ival->use_pos;
	while (p) {
		if (p->pos == load_pos) {
			p->hint_ref = insn->op1;
			break;
		}
		p = p->next;
	}

	if (insn->op2 > 0 && ctx->vregs[insn->op2]) {
		ival = ctx->live_intervals[ctx->vregs[insn->op2]];
		r = &ival->range;
		while (r) {
			if (r->end == load_pos) {
				r->end = pos;
				if (!r->next) {
					ival->end = pos;
				}
				break;
			}
			r = r->next;
		}
		p = ival->use_pos;
		while (p) {
			if (p->pos == load_pos) {
				p->pos = pos;
				p->op_num = 2;
				p2 = p;
				break;
			}
			p = p->next;
		}
	}
	if (p1 && p2) {
		uint8_t tmp = p1->flags;
		p1->flags = p2->flags;
		p2->flags = tmp;
	}
}

static int ir_hint_conflict(ir_ctx *ctx, ir_ref ref, int use, int def)
{
	ir_use_pos *p;
	ir_reg r1 = IR_REG_NONE;
	ir_reg r2 = IR_REG_NONE;

	p = ctx->live_intervals[use]->use_pos;
	while (p) {
		if (IR_LIVE_POS_TO_REF(p->pos) == ref) {
			break;
		}
		if (p->hint != IR_REG_NONE) {
			r1 = p->hint;
		}
		p = p->next;
	}

	p = ctx->live_intervals[def]->use_pos;
	while (p) {
		if (IR_LIVE_POS_TO_REF(p->pos) > ref) {
			if (p->hint != IR_REG_NONE) {
				r2 = p->hint;
				break;
			}
		}
		p = p->next;
	}
	return r1 != r2 && r1 != IR_REG_NONE && r2 != IR_REG_NONE;
}

static int ir_try_swap_operands(ir_ctx *ctx, ir_ref i, ir_insn *insn)
{
	if (ctx->vregs[insn->op1]
	 && ctx->vregs[insn->op1] != ctx->vregs[i]
	 && !ir_vregs_overlap(ctx, ctx->vregs[insn->op1], ctx->vregs[i])
	 && !ir_hint_conflict(ctx, i, ctx->vregs[insn->op1], ctx->vregs[i])) {
		/* pass */
	} else {
		if (ctx->vregs[insn->op2] && ctx->vregs[insn->op2] != ctx->vregs[i]) {
			ir_live_pos pos = IR_USE_LIVE_POS_FROM_REF(i);
			ir_live_pos load_pos = IR_LOAD_LIVE_POS_FROM_REF(i);
			ir_live_interval *ival = ctx->live_intervals[ctx->vregs[insn->op2]];
			ir_live_range *r = &ival->range;

			if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM) && ctx->use_lists[insn->op2].count == 1) {
				return 0;
			}
			while (r) {
				if (r->end == pos) {
					r->end = load_pos;
					if (!r->next) {
						ival->end = load_pos;
					}
					if (!ir_vregs_overlap(ctx, ctx->vregs[insn->op2], ctx->vregs[i])
					 && !ir_hint_conflict(ctx, i, ctx->vregs[insn->op2], ctx->vregs[i])) {
						ir_swap_operands(ctx, i, insn);
						return 1;
					} else {
						r->end = pos;
						if (!r->next) {
							ival->end = pos;
						}
					}
					break;
				}
				r = r->next;
			}
		}
	}
	return 0;
}

int ir_coalesce(ir_ctx *ctx)
{
	uint32_t b, n, succ, pred_b, count = 0;
	ir_ref *p, use, input, k, j;
	ir_block *bb, *succ_bb;
	ir_use_list *use_list;
	ir_insn *insn;
	ir_bitset visited;
	ir_coalesce_block *list;
	bool compact = 0;

	/* Collect a list of blocks which are predecossors to block with phi functions */
	list = ir_mem_malloc(sizeof(ir_coalesce_block) * ctx->cfg_blocks_count);
	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
	for (b = 1, bb = &ctx->cfg_blocks[1]; b <= ctx->cfg_blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if (bb->flags & IR_BB_HAS_PHI) {
			k = bb->predecessors_count;
			if (k > 1) {
				use_list = &ctx->use_lists[bb->start];
				n = use_list->count;
				IR_ASSERT(k == ctx->ir_base[bb->start].inputs_count);
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					use = *p;
					insn = &ctx->ir_base[use];
					if (insn->op == IR_PHI) {
						do {
							k--;
							pred_b = ctx->cfg_edges[bb->predecessors + k];
							if (!ir_bitset_in(visited, pred_b)) {
								ir_bitset_incl(visited, pred_b);
								list[count].b = pred_b;
								list[count].loop_depth = ctx->cfg_blocks[pred_b].loop_depth;
								count++;
							}
						} while (k > 0);
						break;
					}
				}
			}
		}
	}
	ir_mem_free(visited);

	/* Sort blocks according to their loop depth */
	qsort(list, count, sizeof(ir_coalesce_block), ir_block_cmp);

	while (count > 0) {

		count--;
		b = list[count].b;
		bb = &ctx->cfg_blocks[b];
		IR_ASSERT(bb->successors_count == 1);
		succ = ctx->cfg_edges[bb->successors];
		succ_bb = &ctx->cfg_blocks[succ];
		IR_ASSERT(succ_bb->predecessors_count > 1);
		k = ir_phi_input_number(ctx, succ_bb, b);
		use_list = &ctx->use_lists[succ_bb->start];
		n = use_list->count;
		for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
			use = *p;
			insn = &ctx->ir_base[use];
			if (insn->op == IR_PHI) {
				input = ir_insn_op(insn, k);
				if (input > 0 && ctx->vregs[input]) {
					uint32_t v1 = ctx->vregs[input];
					uint32_t v2 = ctx->vregs[use];

					if (v1 == v2) {
						/* already coalesced */
					} else {
						if (!ir_vregs_overlap(ctx, v1, v2)) {
							ir_vregs_coalesce(ctx, v1, v2, input, use);
							compact = 1;
						} else {
#if 1
							if (ctx->rules && (ctx->rules[input] & IR_MAY_SWAP)) {
								ir_insn *input_insn = &ctx->ir_base[input];

								IR_ASSERT(ir_op_flags[input_insn->op] & IR_OP_FLAG_COMMUTATIVE);
								if (input_insn->op2 == use
								 && input_insn->op1 != use
								 && (ctx->live_intervals[v1]->use_pos->flags & IR_DEF_REUSES_OP1_REG)) {
									ir_live_range *r = &ctx->live_intervals[v2]->range;

									do {
										if (r->end == IR_USE_LIVE_POS_FROM_REF(input)) {
											break;
										}
										r = r->next;
									} while (r);
									if (r) {
										r->end = IR_LOAD_LIVE_POS_FROM_REF(input);
										if (!r->next) {
											ctx->live_intervals[v2]->end = IR_LOAD_LIVE_POS_FROM_REF(input);
										}
										if (ir_vregs_overlap(ctx, v1, v2)) {
											r->end = IR_USE_LIVE_POS_FROM_REF(input);
											if (!r->next) {
												ctx->live_intervals[v2]->end = IR_USE_LIVE_POS_FROM_REF(input);
											}
										} else {
											ir_swap_operands(ctx, input, input_insn);
											IR_ASSERT(!ir_vregs_overlap(ctx, v1, v2));
											ir_vregs_coalesce(ctx, v1, v2, input, use);
											compact = 1;
											continue;
										}
									}
								}
							}
#endif
							ir_add_phi_move(ctx, b, input, use);
						}
					}
				} else {
					/* Move for constant input */
					ir_add_phi_move(ctx, b, input, use);
				}
			}
		}
	}
	ir_mem_free(list);

	ir_hint_propagation(ctx);

	if (ctx->rules) {
		/* try to swap operands of commutative instructions for better register allocation */
		uint32_t *rule = ctx->rules + 1;
		ir_ref i;

		for (i = 1; i < ctx->insns_count; rule++, i++) {
			if ((*rule) & (IR_MAY_SWAP|IR_MAY_REUSE)) {
				insn = &ctx->ir_base[i];
				IR_ASSERT(ctx->vregs[i]);
				if ((*rule) & IR_MAY_SWAP) {
					IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE);
					if (ctx->live_intervals[ctx->vregs[i]]->use_pos
					 && (ctx->live_intervals[ctx->vregs[i]]->use_pos->flags & IR_DEF_REUSES_OP1_REG)
					 && insn->op2 > 0
					 && insn->op1 > 0
					 && insn->op1 != insn->op2) {
						ir_try_swap_operands(ctx, i, insn);
					}
				} else {
					IR_ASSERT((*rule) & IR_MAY_REUSE);
					if (insn->op1 > 0
					 && ctx->vregs[insn->op1]
					 && ctx->vregs[i] != ctx->vregs[insn->op1]) {
						if (ir_vregs_inside(ctx, ctx->vregs[insn->op1], ctx->vregs[i])) {
							if (ctx->binding) {
								ir_ref b1 = ir_binding_find(ctx, i);
								ir_ref b2 = ir_binding_find(ctx, insn->op1);
								if (b1 && b1 != b2) {
									continue;
								}
							}
							ir_vregs_coalesce(ctx, ctx->vregs[i], ctx->vregs[insn->op1], i, insn->op1);
							compact = 1;
						}
					}
				}
			}
		}
	}

	if (compact) {
		ir_ref i, n;
		uint32_t *xlat = ir_mem_malloc((ctx->vregs_count + 1) * sizeof(uint32_t));

		for (i = 1, n = 1; i <= ctx->vregs_count; i++) {
			if (ctx->live_intervals[i]) {
				xlat[i] = n;
				if (i != n) {
					ctx->live_intervals[n] = ctx->live_intervals[i];
					ctx->live_intervals[n]->vreg = n;
				}
				n++;
			}
		}
		n--;
		if (n != ctx->vregs_count) {
			j = ctx->vregs_count - n;
			/* vregs + tmp + fixed + SRATCH + ALL */
			for (i = n + 1; i <= n + IR_REG_NUM + 2; i++) {
				ctx->live_intervals[i] = ctx->live_intervals[i + j];
				if (ctx->live_intervals[i]) {
					ctx->live_intervals[i]->vreg = i;
				}
			}
			for (j = 1; j < ctx->insns_count; j++) {
				if (ctx->vregs[j]) {
					ctx->vregs[j] = xlat[ctx->vregs[j]];
				}
			}
			ctx->vregs_count = n;
		}
		ir_mem_free(xlat);
	}

	return 1;
}

/* SSA Deconstruction */

int ir_compute_dessa_moves(ir_ctx *ctx)
{
	uint32_t b, n;
	ir_ref j, k, *p, use;
	ir_block *bb;
	ir_use_list *use_list;
	ir_insn *insn;

	for (b = 1, bb = &ctx->cfg_blocks[1]; b <= ctx->cfg_blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		k = bb->predecessors_count;
		if (k > 1) {
			use_list = &ctx->use_lists[bb->start];
			n = use_list->count;
			if (n > 1) {
				IR_ASSERT(k == ctx->ir_base[bb->start].inputs_count);
				k++;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					use = *p;
					insn = &ctx->ir_base[use];
					if (insn->op == IR_PHI) {
						for (j = 2; j <= k; j++) {
							if (IR_IS_CONST_REF(ir_insn_op(insn, j)) || ctx->vregs[ir_insn_op(insn, j)] != ctx->vregs[use]) {
								int pred = ctx->cfg_edges[bb->predecessors + (j-2)];
								ctx->cfg_blocks[pred].flags &= ~IR_BB_EMPTY;
								ctx->cfg_blocks[pred].flags |= IR_BB_DESSA_MOVES;
								ctx->flags2 |= IR_LR_HAVE_DESSA_MOVES;
							}
						}
					}
				}
			}
		}
	}
	return 1;
}

/*
 * Parallel copy sequentialization algorithm
 *
 * The implementation is based on algorithm 1 desriebed in
 * "Revisiting Out-of-SSA Translation for Correctness, Code Quality and Efficiency",
 * Benoit Boissinot, Alain Darte, Fabrice Rastello, Benoit Dupont de Dinechin, Christophe Guillon.
 * 2009 International Symposium on Code Generation and Optimization, Seattle, WA, USA, 2009,
 * pp. 114-125, doi: 10.1109/CGO.2009.19.
 */
int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy)
{
	uint32_t succ, k, n = 0;
	ir_block *bb, *succ_bb;
	ir_use_list *use_list;
	ir_ref *loc, *pred, *src, *dst, i, *p, ref, input;
	ir_ref s, d;
	ir_insn *insn;
	uint32_t len;
	ir_bitset todo, ready;
	bool have_constants_or_addresses = 0;

	bb = &ctx->cfg_blocks[b];
	if (!(bb->flags & IR_BB_DESSA_MOVES)) {
		return 0;
	}
	IR_ASSERT(bb->successors_count == 1);
	succ = ctx->cfg_edges[bb->successors];
	succ_bb = &ctx->cfg_blocks[succ];
	IR_ASSERT(succ_bb->predecessors_count > 1);
	use_list = &ctx->use_lists[succ_bb->start];

	k = ir_phi_input_number(ctx, succ_bb, b);

	loc = ir_mem_malloc((ctx->vregs_count + 1) * 4 * sizeof(ir_ref));
	pred = loc + ctx->vregs_count + 1;
	src = pred + ctx->vregs_count + 1;
	dst = src + ctx->vregs_count + 1;
	len = ir_bitset_len(ctx->vregs_count + 1);
	todo = ir_bitset_malloc(ctx->vregs_count + 1);

	for (i = use_list->count, p = &ctx->use_edges[use_list->refs]; i > 0; p++, i--) {
		ref = *p;
		insn = &ctx->ir_base[ref];
		if (insn->op == IR_PHI) {
			input = ir_insn_op(insn, k);
			if (IR_IS_CONST_REF(input) || !ctx->vregs[input]) {
				have_constants_or_addresses = 1;
			} else if (ctx->vregs[input] != ctx->vregs[ref]) {
				s = ctx->vregs[input];
				d = ctx->vregs[ref];
				src[s] = input;
				dst[d] = ref;
				loc[d] = pred[s] = 0;
				ir_bitset_incl(todo, d);
				n++;
			}
		}
	}

	if (n > 0) {
		src[0] = dst[0] = 0;
		ready = ir_bitset_malloc(ctx->vregs_count + 1);
		IR_BITSET_FOREACH(todo, len, d) {
			ref = dst[d];
			insn = &ctx->ir_base[ref];
			IR_ASSERT(insn->op == IR_PHI);
			input = ir_insn_op(insn, k);
			s = ctx->vregs[input];
			loc[s] = s;
			pred[d] = s;
		} IR_BITSET_FOREACH_END();

		IR_BITSET_FOREACH(todo, len, i) {
			if (!loc[i]) {
				ir_bitset_incl(ready, i);
			}
		} IR_BITSET_FOREACH_END();

		while (1) {
			ir_ref a, b, c;

			while ((b = ir_bitset_pop_first(ready, len)) >= 0) {
				a = pred[b];
				c = loc[a];
				emit_copy(ctx, ctx->ir_base[dst[b]].type, src[c], dst[b]);
				ir_bitset_excl(todo, b);
				loc[a] = b;
				src[b] = dst[b];
				if (a == c && pred[a]) {
					ir_bitset_incl(ready, a);
				}
			}
			b = ir_bitset_pop_first(todo, len);
			if (b < 0) {
				break;
			}
			IR_ASSERT(b != loc[pred[b]]);
			emit_copy(ctx, ctx->ir_base[src[b]].type, src[b], 0);
			loc[b] = 0;
			ir_bitset_incl(ready, b);
		}

		ir_mem_free(ready);
	}

	ir_mem_free(todo);
	ir_mem_free(loc);

	if (have_constants_or_addresses) {
		for (i = use_list->count, p = &ctx->use_edges[use_list->refs]; i > 0; p++, i--) {
			ref = *p;
			insn = &ctx->ir_base[ref];
			if (insn->op == IR_PHI) {
				input = ir_insn_op(insn, k);
				if (IR_IS_CONST_REF(input) || !ctx->vregs[input]) {
					emit_copy(ctx, insn->type, input, ref);
				}
			}
		}
	}

	return 1;
}

/* Linear Scan Register Allocation */

#ifdef IR_DEBUG
# define IR_LOG_LSRA(action, ival, comment) do { \
		if (ctx->flags & IR_DEBUG_RA) { \
			ir_live_interval *_ival = (ival); \
			ir_live_pos _start = _ival->range.start; \
			ir_live_pos _end = _ival->end; \
			fprintf(stderr, action " R%d [%d.%d...%d.%d)" comment "\n", \
				(_ival->flags & IR_LIVE_INTERVAL_TEMP) ? 0 : _ival->vreg, \
				IR_LIVE_POS_TO_REF(_start), IR_LIVE_POS_TO_SUB_REF(_start), \
				IR_LIVE_POS_TO_REF(_end), IR_LIVE_POS_TO_SUB_REF(_end)); \
		} \
	} while (0)
# define IR_LOG_LSRA_ASSIGN(action, ival, comment) do { \
		if (ctx->flags & IR_DEBUG_RA) { \
			ir_live_interval *_ival = (ival); \
			ir_live_pos _start = _ival->range.start; \
			ir_live_pos _end = _ival->end; \
			fprintf(stderr, action " R%d [%d.%d...%d.%d) to %s" comment "\n", \
				(_ival->flags & IR_LIVE_INTERVAL_TEMP) ? 0 : _ival->vreg, \
				IR_LIVE_POS_TO_REF(_start), IR_LIVE_POS_TO_SUB_REF(_start), \
				IR_LIVE_POS_TO_REF(_end), IR_LIVE_POS_TO_SUB_REF(_end), \
				ir_reg_name(_ival->reg, _ival->type)); \
		} \
	} while (0)
# define IR_LOG_LSRA_SPLIT(ival, pos) do { \
		if (ctx->flags & IR_DEBUG_RA) { \
			ir_live_interval *_ival = (ival); \
			ir_live_pos _start = _ival->range.start; \
			ir_live_pos _end = _ival->end; \
			ir_live_pos _pos = (pos); \
			fprintf(stderr, "      ---- Split R%d [%d.%d...%d.%d) at %d.%d\n", \
				(_ival->flags & IR_LIVE_INTERVAL_TEMP) ? 0 : _ival->vreg, \
				IR_LIVE_POS_TO_REF(_start), IR_LIVE_POS_TO_SUB_REF(_start), \
				IR_LIVE_POS_TO_REF(_end), IR_LIVE_POS_TO_SUB_REF(_end), \
				IR_LIVE_POS_TO_REF(_pos), IR_LIVE_POS_TO_SUB_REF(_pos)); \
		} \
	} while (0)
# define IR_LOG_LSRA_CONFLICT(action, ival, pos) do { \
		if (ctx->flags & IR_DEBUG_RA) { \
			ir_live_interval *_ival = (ival); \
			ir_live_pos _start = _ival->range.start; \
			ir_live_pos _end = _ival->end; \
			ir_live_pos _pos = (pos); \
			fprintf(stderr, action " R%d [%d.%d...%d.%d) assigned to %s at %d.%d\n", \
				(_ival->flags & IR_LIVE_INTERVAL_TEMP) ? 0 : _ival->vreg, \
				IR_LIVE_POS_TO_REF(_start), IR_LIVE_POS_TO_SUB_REF(_start), \
				IR_LIVE_POS_TO_REF(_end), IR_LIVE_POS_TO_SUB_REF(_end), \
				ir_reg_name(_ival->reg, _ival->type), \
				IR_LIVE_POS_TO_REF(_pos), IR_LIVE_POS_TO_SUB_REF(_pos)); \
		} \
	} while (0)
#else
# define IR_LOG_LSRA(action, ival, comment)
# define IR_LOG_LSRA_ASSIGN(action, ival, comment)
# define IR_LOG_LSRA_SPLIT(ival, pos)
# define IR_LOG_LSRA_CONFLICT(action, ival, pos);
#endif

static bool ir_ival_covers(ir_live_interval *ival, ir_live_pos position)
{
	ir_live_range *live_range = &ival->range;

	do {
		if (position < live_range->end) {
			return position >= live_range->start;
		}
		live_range = live_range->next;
	} while (live_range);

	return 0;
}

static bool ir_ival_has_hole_between(ir_live_interval *ival, ir_live_pos from, ir_live_pos to)
{
	ir_live_range *r = &ival->range;

	while (r) {
		if (from < r->start) {
			return 1;
		} else if (to <= r->end) {
			return 0;
		}
		r = r->next;
	}
	return 0;
}


static ir_live_pos ir_last_use_pos_before(ir_live_interval *ival, ir_live_pos pos, uint8_t flags)
{
	ir_live_pos ret = 0;
	ir_use_pos *p = ival->use_pos;

	while (p && p->pos <= pos) {
		if (p->flags & flags) {
			ret = p->pos;
		}
		p = p->next;
	}
	return ret;
}

static ir_live_pos ir_first_use_pos_after(ir_live_interval *ival, ir_live_pos pos, uint8_t flags)
{
	ir_use_pos *p = ival->use_pos;

	while (p && p->pos < pos) {
		p = p->next;
	}
	if (p && p->pos == pos && p->op_num != 0) {
		p = p->next;
	}
	while (p && !(p->flags & flags)) {
		p = p->next;
	}
	return p ? p->pos : 0x7fffffff;
}

static ir_live_pos ir_first_use_pos(ir_live_interval *ival, uint8_t flags)
{
	ir_use_pos *p = ival->use_pos;

	while (p && !(p->flags & flags)) {
		p = p->next;
	}
	return p ? p->pos : 0x7fffffff;
}

static ir_block *ir_block_from_live_pos(ir_ctx *ctx, ir_live_pos pos)
{
	ir_ref ref = IR_LIVE_POS_TO_REF(pos);
	uint32_t b = ctx->cfg_map[ref];

	while (!b) {
		ref--;
		IR_ASSERT(ref > 0);
		b = ctx->cfg_map[ref];
	}
	IR_ASSERT(b <= ctx->cfg_blocks_count);
	return &ctx->cfg_blocks[b];
}

static ir_live_pos ir_find_optimal_split_position(ir_ctx *ctx, ir_live_interval *ival, ir_live_pos min_pos, ir_live_pos max_pos, bool prefer_max)
{
	ir_block *min_bb, *max_bb;

	if (min_pos == max_pos) {
		return max_pos;
	}

	IR_ASSERT(min_pos < max_pos);
	IR_ASSERT(min_pos >= ival->range.start);
	IR_ASSERT(max_pos < ival->end);

	min_bb = ir_block_from_live_pos(ctx, min_pos);
	max_bb = ir_block_from_live_pos(ctx, max_pos);

	if (min_bb == max_bb
	 || ir_ival_has_hole_between(ival, min_pos, max_pos)) {  // TODO: ???
		return (prefer_max) ? max_pos : min_pos;
	}

	if (max_bb->loop_depth > 0) {
		/* Split at the end of the loop entry */
		do {
			ir_block *bb;

			if (max_bb->flags & IR_BB_LOOP_HEADER) {
				bb = max_bb;
			} else {
				IR_ASSERT(max_bb->loop_header);
				bb = &ctx->cfg_blocks[max_bb->loop_header];
			}
			bb = &ctx->cfg_blocks[bb->idom];
			if (IR_DEF_LIVE_POS_FROM_REF(bb->end) < min_pos) {
				break;
			}
			max_bb = bb;
		} while (max_bb->loop_depth > 0);

		if (IR_DEF_LIVE_POS_FROM_REF(max_bb->end) < max_pos) {
			return IR_DEF_LIVE_POS_FROM_REF(max_bb->end);
		}
	}

	if (IR_LOAD_LIVE_POS_FROM_REF(max_bb->start) > min_pos) {
		return IR_LOAD_LIVE_POS_FROM_REF(max_bb->start);
	} else {
		// TODO: "min_bb" is in a deeper loop than "max_bb" ???
		return max_pos;
	}
}

static ir_live_interval *ir_split_interval_at(ir_ctx *ctx, ir_live_interval *ival, ir_live_pos pos)
{
	ir_live_interval *child;
	ir_live_range *p, *prev;
	ir_use_pos *use_pos, *prev_use_pos;

	IR_LOG_LSRA_SPLIT(ival, pos);
	IR_ASSERT(pos > ival->range.start);
	ctx->flags2 |= IR_RA_HAVE_SPLITS;

	p = &ival->range;
	prev = NULL;
	while (p && pos >= p->end) {
		prev = p;
		p = prev->next;
	}
	IR_ASSERT(p);

	if (pos < p->start) {
		/* split between ranges */
		pos = p->start;
	}

	use_pos = ival->use_pos;
	prev_use_pos = NULL;

	ival->flags &= ~(IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS);
	if (p->start == pos) {
		while (use_pos && pos > use_pos->pos) {
			if (use_pos->hint != IR_REG_NONE) {
				ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REGS;
			}
			if (use_pos->hint_ref > 0) {
				ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REFS;
			}
			prev_use_pos = use_pos;
			use_pos = use_pos->next;
		}
	} else {
		while (use_pos && pos >= use_pos->pos) {
			if (use_pos->hint != IR_REG_NONE) {
				ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REGS;
			}
			if (use_pos->hint_ref > 0) {
				ival->flags |= IR_LIVE_INTERVAL_HAS_HINT_REFS;
			}
			prev_use_pos = use_pos;
			use_pos = use_pos->next;
		}
	}

	child = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
	child->type = ival->type;
	child->reg = IR_REG_NONE;
	child->flags = IR_LIVE_INTERVAL_SPLIT_CHILD;
	child->vreg = ival->vreg;
	child->stack_spill_pos = -1; // not allocated
	child->range.start = pos;
	child->range.end = p->end;
	child->range.next = p->next;
	child->end = ival->end;
	child->use_pos = prev_use_pos ? prev_use_pos->next : use_pos;

	child->next = ival->next;
	ival->next = child;

	if (pos == p->start) {
		prev->next = NULL;
		ival->end = prev->end;
		/* Cache to reuse */
		p->next = ctx->unused_ranges;
		ctx->unused_ranges = p;
	} else {
		p->end = ival->end = pos;
		p->next = NULL;
	}
	if (prev_use_pos) {
		prev_use_pos->next = NULL;
	} else {
		ival->use_pos = NULL;
	}

	use_pos = child->use_pos;
	while (use_pos) {
		if (use_pos->hint != IR_REG_NONE) {
			child->flags |= IR_LIVE_INTERVAL_HAS_HINT_REGS;
		}
		if (use_pos->hint_ref > 0) {
			child->flags |= IR_LIVE_INTERVAL_HAS_HINT_REFS;
		}
		use_pos = use_pos->next;
	}

	return child;
}

static int32_t ir_allocate_small_spill_slot(ir_ctx *ctx, size_t size, ir_reg_alloc_data *data)
{
	int32_t ret;

	IR_ASSERT(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
	if (data->handled && data->handled[size]) {
		ret = data->handled[size]->stack_spill_pos;
		data->handled[size] = data->handled[size]->list_next;
	} else if (size == 8) {
		ret = ctx->stack_frame_size;
		ctx->stack_frame_size += 8;
	} else if (size == 4) {
		if (data->unused_slot_4) {
			ret = data->unused_slot_4;
			data->unused_slot_4 = 0;
	    } else if (data->handled && data->handled[8]) {
			ret = data->handled[8]->stack_spill_pos;
			data->handled[8] = data->handled[8]->list_next;
			data->unused_slot_4 = ret + 4;
		} else {
			ret = ctx->stack_frame_size;
			if (sizeof(void*) == 8) {
				data->unused_slot_4 = ctx->stack_frame_size + 4;
				ctx->stack_frame_size += 8;
			} else {
				ctx->stack_frame_size += 4;
			}
		}
	} else if (size == 2) {
		if (data->unused_slot_2) {
			ret = data->unused_slot_2;
			data->unused_slot_2 = 0;
		} else if (data->unused_slot_4) {
			ret = data->unused_slot_4;
			data->unused_slot_2 = data->unused_slot_4 + 2;
			data->unused_slot_4 = 0;
	    } else if (data->handled && data->handled[4]) {
			ret = data->handled[4]->stack_spill_pos;
			data->handled[4] = data->handled[4]->list_next;
			data->unused_slot_2 = ret + 2;
	    } else if (data->handled && data->handled[8]) {
			ret = data->handled[8]->stack_spill_pos;
			data->handled[8] = data->handled[8]->list_next;
			data->unused_slot_2 = ret + 2;
			data->unused_slot_4 = ret + 4;
		} else {
			ret = ctx->stack_frame_size;
			data->unused_slot_2 = ctx->stack_frame_size + 2;
			if (sizeof(void*) == 8) {
				data->unused_slot_4 = ctx->stack_frame_size + 4;
				ctx->stack_frame_size += 8;
			} else {
				ctx->stack_frame_size += 4;
			}
		}
	} else if (size == 1) {
		if (data->unused_slot_1) {
			ret = data->unused_slot_1;
			data->unused_slot_1 = 0;
		} else if (data->unused_slot_2) {
			ret = data->unused_slot_2;
			data->unused_slot_1 = data->unused_slot_2 + 1;
			data->unused_slot_2 = 0;
		} else if (data->unused_slot_4) {
			ret = data->unused_slot_4;
			data->unused_slot_1 = data->unused_slot_4 + 1;
			data->unused_slot_2 = data->unused_slot_4 + 2;
			data->unused_slot_4 = 0;
	    } else if (data->handled && data->handled[2]) {
			ret = data->handled[2]->stack_spill_pos;
			data->handled[2] = data->handled[2]->list_next;
			data->unused_slot_1 = ret + 1;
	    } else if (data->handled && data->handled[4]) {
			ret = data->handled[4]->stack_spill_pos;
			data->handled[4] = data->handled[4]->list_next;
			data->unused_slot_1 = ret + 1;
			data->unused_slot_2 = ret + 2;
	    } else if (data->handled && data->handled[8]) {
			ret = data->handled[8]->stack_spill_pos;
			data->handled[8] = data->handled[8]->list_next;
			data->unused_slot_1 = ret + 1;
			data->unused_slot_2 = ret + 2;
			data->unused_slot_4 = ret + 4;
		} else {
			ret = ctx->stack_frame_size;
			data->unused_slot_1 = ctx->stack_frame_size + 1;
			data->unused_slot_2 = ctx->stack_frame_size + 2;
			if (sizeof(void*) == 8) {
				data->unused_slot_4 = ctx->stack_frame_size + 4;
				ctx->stack_frame_size += 8;
			} else {
				ctx->stack_frame_size += 4;
			}
		}
	} else {
		ret = IR_NULL;
	}
	return ret;
}

int32_t ir_allocate_spill_slot(ir_ctx *ctx, ir_type type, ir_reg_alloc_data *data)
{
	return ir_allocate_small_spill_slot(ctx, ir_type_size[type], data);
}

static int32_t ir_allocate_big_spill_slot(ir_ctx *ctx, int32_t size, ir_reg_alloc_data *data)
{
	int32_t ret;

	if (size <= 8) {
		if (size == 3) {
			size = 4;
		} else if (size > 4 && size < 8) {
			size = 8;
		}
		return ir_allocate_small_spill_slot(ctx, size, data);
	}

	/* Align stack allocated data to 16 byte */
	ctx->flags2 |= IR_16B_FRAME_ALIGNMENT;
	ret = IR_ALIGNED_SIZE(ctx->stack_frame_size, 16);
	size = IR_ALIGNED_SIZE(size, 8);
	ctx->stack_frame_size = ret + size;

	return ret;
}

static ir_reg ir_get_first_reg_hint(ir_ctx *ctx, ir_live_interval *ival, ir_regset available)
{
	ir_use_pos *use_pos;
	ir_reg reg;

	use_pos = ival->use_pos;
	while (use_pos) {
		reg = use_pos->hint;
		if (reg >= 0 && IR_REGSET_IN(available, reg)) {
			return reg;
		}
		use_pos = use_pos->next;
	}

	return IR_REG_NONE;
}

static ir_reg ir_try_allocate_preferred_reg(ir_ctx *ctx, ir_live_interval *ival, ir_regset available, ir_live_pos *freeUntilPos)
{
	ir_use_pos *use_pos;
	ir_reg reg;

	if (ival->flags & IR_LIVE_INTERVAL_HAS_HINT_REGS) {
		use_pos = ival->use_pos;
		while (use_pos) {
			reg = use_pos->hint;
			if (reg >= 0 && IR_REGSET_IN(available, reg)) {
				if (ival->end <= freeUntilPos[reg]) {
					/* register available for the whole interval */
					return reg;
				}
			}
			use_pos = use_pos->next;
		}
	}

	if (ival->flags & IR_LIVE_INTERVAL_HAS_HINT_REFS) {
		use_pos = ival->use_pos;
		while (use_pos) {
			if (use_pos->hint_ref > 0) {
				reg = ctx->live_intervals[ctx->vregs[use_pos->hint_ref]]->reg;
				if (reg >= 0 && IR_REGSET_IN(available, reg)) {
					if (ival->end <= freeUntilPos[reg]) {
						/* register available for the whole interval */
						return reg;
					}
				}
			}
			use_pos = use_pos->next;
		}
	}

	return IR_REG_NONE;
}

static ir_reg ir_get_preferred_reg(ir_ctx *ctx, ir_live_interval *ival, ir_regset available)
{
	ir_use_pos *use_pos;
	ir_reg reg;

	use_pos = ival->use_pos;
	while (use_pos) {
		reg = use_pos->hint;
		if (reg >= 0 && IR_REGSET_IN(available, reg)) {
			return reg;
		} else if (use_pos->hint_ref > 0) {
			reg = ctx->live_intervals[ctx->vregs[use_pos->hint_ref]]->reg;
			if (reg >= 0 && IR_REGSET_IN(available, reg)) {
				return reg;
			}
		}
		use_pos = use_pos->next;
	}

	return IR_REG_NONE;
}

static void ir_add_to_unhandled(ir_live_interval **unhandled, ir_live_interval *ival)
{
	ir_live_pos pos = ival->range.start;

	if (*unhandled == NULL
	 || pos < (*unhandled)->range.start
	 || (pos == (*unhandled)->range.start
	  && (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS))
	  && !((*unhandled)->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)))
	 || (pos == (*unhandled)->range.start
	  && ival->vreg > (*unhandled)->vreg)) {
		ival->list_next = *unhandled;
		*unhandled = ival;
	} else {
		ir_live_interval *prev = *unhandled;

		while (prev->list_next) {
			if (pos < prev->list_next->range.start
			 || (pos == prev->list_next->range.start
			  && (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS))
			  && !(prev->list_next->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)))
			 || (pos == prev->list_next->range.start
			  && ival->vreg > prev->list_next->vreg)) {
				break;
			}
			prev = prev->list_next;
		}
		ival->list_next = prev->list_next;
		prev->list_next = ival;
	}
}

/* merge sorted lists */
static void ir_merge_to_unhandled(ir_live_interval **unhandled, ir_live_interval *ival)
{
	ir_live_interval **prev;
	ir_live_pos pos;

	if (*unhandled == NULL) {
		*unhandled = ival;
		while (ival) {
			ival = ival->list_next = ival->next;
		}
	} else {
		prev = unhandled;
		while (ival) {
			pos = ival->range.start;
			while (*prev && pos >= (*prev)->range.start) {
				prev = &(*prev)->list_next;
			}
			ival->list_next = *prev;
			*prev = ival;
			prev = &ival->list_next;
			ival = ival->next;
		}
	}
#ifdef IR_DEBUG
	ival = *unhandled;
	pos = 0;

	while (ival) {
		IR_ASSERT(ival->range.start >= pos);
		pos = ival->range.start;
		ival = ival->list_next;
	}
#endif
}

static void ir_add_to_unhandled_spill(ir_live_interval **unhandled, ir_live_interval *ival)
{
	ir_live_pos pos = ival->range.start;

	if (*unhandled == NULL
	 || pos <= (*unhandled)->range.start) {
		ival->list_next = *unhandled;
		*unhandled = ival;
	} else {
		ir_live_interval *prev = *unhandled;

		while (prev->list_next) {
			if (pos <= prev->list_next->range.start) {
				break;
			}
			prev = prev->list_next;
		}
		ival->list_next = prev->list_next;
		prev->list_next = ival;
	}
}

static ir_reg ir_try_allocate_free_reg(ir_ctx *ctx, ir_live_interval *ival, ir_live_interval **active, ir_live_interval *inactive, ir_live_interval **unhandled)
{
	ir_live_pos freeUntilPos[IR_REG_NUM];
	int i, reg;
	ir_live_pos pos, next;
	ir_live_interval *other;
	ir_regset available, overlapped, scratch;

	if (IR_IS_TYPE_FP(ival->type)) {
		available = IR_REGSET_FP;
		/* set freeUntilPos of all physical registers to maxInt */
		for (i = IR_REG_FP_FIRST; i <= IR_REG_FP_LAST; i++) {
			freeUntilPos[i] = 0x7fffffff;
		}
	} else {
		available = IR_REGSET_GP;
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			IR_REGSET_EXCL(available, IR_REG_FRAME_POINTER);
		}
#if defined(IR_TARGET_X86)
		if (ir_type_size[ival->type] == 1) {
			/* TODO: if no registers avialivle, we may use of one this register for already allocated interval ??? */
			IR_REGSET_EXCL(available, IR_REG_RBP);
			IR_REGSET_EXCL(available, IR_REG_RSI);
			IR_REGSET_EXCL(available, IR_REG_RDI);
		}
#endif
		/* set freeUntilPos of all physical registers to maxInt */
		for (i = IR_REG_GP_FIRST; i <= IR_REG_GP_LAST; i++) {
			freeUntilPos[i] = 0x7fffffff;
		}
	}

	available = IR_REGSET_DIFFERENCE(available, (ir_regset)ctx->fixed_regset);

	/* for each interval it in active */
	other = *active;
	while (other) {
		/* freeUntilPos[it.reg] = 0 */
		reg = other->reg;
		IR_ASSERT(reg >= 0);
		if (reg >= IR_REG_SCRATCH) {
			if (reg == IR_REG_SCRATCH) {
				available = IR_REGSET_DIFFERENCE(available, IR_REGSET_SCRATCH);
			} else {
				IR_ASSERT(reg == IR_REG_ALL);
				available = IR_REGSET_EMPTY;
			}
		} else {
			IR_REGSET_EXCL(available, reg);
		}
		other = other->list_next;
	}

	/* for each interval it in inactive intersecting with current
	 *
	 * This loop is not necessary for program in SSA form (see LSRA on SSA fig. 6),
	 * but it is still necessary after coalescing and splitting
	 */
	overlapped = IR_REGSET_EMPTY;
	other = inactive;
	pos = ival->end;
	while (other) {
		/* freeUntilPos[it.reg] = next intersection of it with current */
		if (other->current_range->start < pos) {
			next = ir_ivals_overlap(&ival->range, other->current_range);
			if (next) {
				reg = other->reg;
				IR_ASSERT(reg >= 0);
				if (reg >= IR_REG_SCRATCH) {
					ir_regset regset;

					if (reg == IR_REG_SCRATCH) {
						regset = IR_REGSET_INTERSECTION(available, IR_REGSET_SCRATCH);
					} else {
						IR_ASSERT(reg == IR_REG_ALL);
						regset = available;
					}
					overlapped = IR_REGSET_UNION(overlapped, regset);
					IR_REGSET_FOREACH(regset, reg) {
						if (next < freeUntilPos[reg]) {
							freeUntilPos[reg] = next;
						}
					} IR_REGSET_FOREACH_END();
				} else if (IR_REGSET_IN(available, reg)) {
					IR_REGSET_INCL(overlapped, reg);
					if (next < freeUntilPos[reg]) {
						freeUntilPos[reg] = next;
					}
				}
			}
		}
		other = other->list_next;
	}

	available = IR_REGSET_DIFFERENCE(available, overlapped);
	if (available != IR_REGSET_EMPTY) {

		if (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)) {
			/* Try to use hint */
			reg = ir_try_allocate_preferred_reg(ctx, ival, available, freeUntilPos);
			if (reg != IR_REG_NONE) {
				ival->reg = reg;
				IR_LOG_LSRA_ASSIGN("    ---- Assign", ival, " (hint available without spilling)");
				if (*unhandled && ival->end > (*unhandled)->range.start) {
					ival->list_next = *active;
					*active = ival;
				}
				return reg;
			}
		}

		if (ival->flags & IR_LIVE_INTERVAL_SPLIT_CHILD) {
			/* Try to reuse the register previously allocated for splited interval */
			reg = ctx->live_intervals[ival->vreg]->reg;
			if (reg >= 0 && IR_REGSET_IN(available, reg)) {
				ival->reg = reg;
				IR_LOG_LSRA_ASSIGN("    ---- Assign", ival, " (available without spilling)");
				if (*unhandled && ival->end > (*unhandled)->range.start) {
					ival->list_next = *active;
					*active = ival;
				}
				return reg;
			}
		}

		/* prefer caller-saved registers to avoid save/restore in prologue/epilogue */
		scratch = IR_REGSET_INTERSECTION(available, IR_REGSET_SCRATCH);
		if (scratch != IR_REGSET_EMPTY) {
			/* prefer registers that don't conflict with the hints for the following unhandled intervals */
			if (1) {
				ir_regset non_conflicting = scratch;

				other = *unhandled;
				while (other && other->range.start < ival->range.end) {
					if (other->flags & IR_LIVE_INTERVAL_HAS_HINT_REGS) {
						reg = ir_get_first_reg_hint(ctx, other, non_conflicting);

						if (reg >= 0) {
							IR_REGSET_EXCL(non_conflicting, reg);
							if (non_conflicting == IR_REGSET_EMPTY) {
								break;
							}
						}
					}
					other = other->list_next;
				}
				if (non_conflicting != IR_REGSET_EMPTY) {
					reg = IR_REGSET_FIRST(non_conflicting);
				} else {
					reg = IR_REGSET_FIRST(scratch);
				}
			} else {
				reg = IR_REGSET_FIRST(scratch);
			}
		} else {
			reg = IR_REGSET_FIRST(available);
		}
		ival->reg = reg;
		IR_LOG_LSRA_ASSIGN("    ---- Assign", ival, " (available without spilling)");
		if (*unhandled && ival->end > (*unhandled)->range.start) {
			ival->list_next = *active;
			*active = ival;
		}
		return reg;
	}

	/* reg = register with highest freeUntilPos */
	reg = IR_REG_NONE;
	pos = 0;
	IR_REGSET_FOREACH(overlapped, i) {
		if (freeUntilPos[i] > pos) {
			pos = freeUntilPos[i];
			reg = i;
		} else if (freeUntilPos[i] == pos
				&& !IR_REGSET_IN(IR_REGSET_SCRATCH, reg)
				&& IR_REGSET_IN(IR_REGSET_SCRATCH, i)) {
			/* prefer caller-saved registers to avoid save/restore in prologue/epilogue */
			pos = freeUntilPos[i];
			reg = i;
		}
	} IR_REGSET_FOREACH_END();

	if (pos > ival->range.start) {
		/* register available for the first part of the interval */
		/* split current before freeUntilPos[reg] */
		ir_live_pos split_pos = ir_last_use_pos_before(ival, pos,
			IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG);
		if (split_pos > ival->range.start) {
			split_pos = ir_find_optimal_split_position(ctx, ival, split_pos, pos, 0);
			other = ir_split_interval_at(ctx, ival, split_pos);
			if (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)) {
				ir_reg pref_reg = ir_try_allocate_preferred_reg(ctx, ival, IR_REGSET_UNION(available, overlapped), freeUntilPos);

				if (pref_reg != IR_REG_NONE) {
					ival->reg = pref_reg;
				} else {
					ival->reg = reg;
				}
			} else {
				ival->reg = reg;
			}
			IR_LOG_LSRA_ASSIGN("    ---- Assign", ival, " (available without spilling for the first part)");
			if (*unhandled && ival->end > (*unhandled)->range.start) {
				ival->list_next = *active;
				*active = ival;
			}
			ir_add_to_unhandled(unhandled, other);
			IR_LOG_LSRA("      ---- Queue", other, "");
			return reg;
		}
	}
	return IR_REG_NONE;
}

static ir_reg ir_allocate_blocked_reg(ir_ctx *ctx, ir_live_interval *ival, ir_live_interval **active, ir_live_interval **inactive, ir_live_interval **unhandled)
{
	ir_live_pos nextUsePos[IR_REG_NUM];
	ir_live_pos blockPos[IR_REG_NUM];
	int i, reg;
	ir_live_pos pos, next_use_pos;
	ir_live_interval *other, *prev;
	ir_use_pos *use_pos;
	ir_regset available, tmp_regset;

	if (!(ival->flags & IR_LIVE_INTERVAL_TEMP)) {
		use_pos = ival->use_pos;
		while (use_pos && !(use_pos->flags & IR_USE_MUST_BE_IN_REG)) {
			use_pos = use_pos->next;
		}
		if (!use_pos) {
			/* spill */
			IR_LOG_LSRA("    ---- Spill", ival, " (no use pos that must be in reg)");
			ctx->flags2 |= IR_RA_HAVE_SPILLS;
			return IR_REG_NONE;
		}
		next_use_pos = use_pos->pos;
	} else {
		next_use_pos = ival->range.end;
	}

	if (IR_IS_TYPE_FP(ival->type)) {
		available = IR_REGSET_FP;
		/* set nextUsePos of all physical registers to maxInt */
		for (i = IR_REG_FP_FIRST; i <= IR_REG_FP_LAST; i++) {
			nextUsePos[i] = 0x7fffffff;
			blockPos[i] = 0x7fffffff;
		}
	} else {
		available = IR_REGSET_GP;
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			IR_REGSET_EXCL(available, IR_REG_FRAME_POINTER);
		}
#if defined(IR_TARGET_X86)
		if (ir_type_size[ival->type] == 1) {
			/* TODO: if no registers avialivle, we may use of one this register for already allocated interval ??? */
			IR_REGSET_EXCL(available, IR_REG_RBP);
			IR_REGSET_EXCL(available, IR_REG_RSI);
			IR_REGSET_EXCL(available, IR_REG_RDI);
		}
#endif
		/* set nextUsePos of all physical registers to maxInt */
		for (i = IR_REG_GP_FIRST; i <= IR_REG_GP_LAST; i++) {
			nextUsePos[i] = 0x7fffffff;
			blockPos[i] = 0x7fffffff;
		}
	}

	available = IR_REGSET_DIFFERENCE(available, (ir_regset)ctx->fixed_regset);

	if (IR_REGSET_IS_EMPTY(available)) {
		fprintf(stderr, "LSRA Internal Error: No registers available. Allocation is not possible\n");
		IR_ASSERT(0);
		exit(-1);
	}

	/* for each interval it in active */
	other = *active;
	while (other) {
		/* nextUsePos[it.reg] = next use of it after start of current */
		reg = other->reg;
		IR_ASSERT(reg >= 0);
		if (reg >= IR_REG_SCRATCH) {
			ir_regset regset;

			if (reg == IR_REG_SCRATCH) {
				regset = IR_REGSET_INTERSECTION(available, IR_REGSET_SCRATCH);
			} else {
				IR_ASSERT(reg == IR_REG_ALL);
				regset = available;
			}
			IR_REGSET_FOREACH(regset, reg) {
				blockPos[reg] = nextUsePos[reg] = 0;
			} IR_REGSET_FOREACH_END();
		} else if (IR_REGSET_IN(available, reg)) {
			if (other->flags & (IR_LIVE_INTERVAL_FIXED|IR_LIVE_INTERVAL_TEMP)) {
				blockPos[reg] = nextUsePos[reg] = 0;
			} else {
				pos = ir_first_use_pos_after(other, ival->range.start,
					IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG);
				if (pos < nextUsePos[reg]) {
					nextUsePos[reg] = pos;
				}
			}
		}
		other = other->list_next;
	}

	/* for each interval it in inactive intersecting with current */
	other = *inactive;
	while (other) {
		/* freeUntilPos[it.reg] = next intersection of it with current */
		reg = other->reg;
		IR_ASSERT(reg >= 0);
		if (reg >= IR_REG_SCRATCH) {
			ir_live_pos overlap = ir_ivals_overlap(&ival->range, other->current_range);

			if (overlap) {
				ir_regset regset;

				if (reg == IR_REG_SCRATCH) {
					regset = IR_REGSET_INTERSECTION(available, IR_REGSET_SCRATCH);
				} else {
					IR_ASSERT(reg == IR_REG_ALL);
					regset = available;
				}
				IR_REGSET_FOREACH(regset, reg) {
					if (overlap < nextUsePos[reg]) {
						nextUsePos[reg] = overlap;
					}
					if (overlap < blockPos[reg]) {
						blockPos[reg] = overlap;
					}
				} IR_REGSET_FOREACH_END();
			}
		} else if (IR_REGSET_IN(available, reg)) {
			ir_live_pos overlap = ir_ivals_overlap(&ival->range, other->current_range);

			if (overlap) {
				if (other->flags & (IR_LIVE_INTERVAL_FIXED|IR_LIVE_INTERVAL_TEMP)) {
					if (overlap < nextUsePos[reg]) {
						nextUsePos[reg] = overlap;
					}
					if (overlap < blockPos[reg]) {
						blockPos[reg] = overlap;
					}
				} else {
					pos = ir_first_use_pos_after(other, ival->range.start,
						IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG);
					if (pos < nextUsePos[reg]) {
						nextUsePos[reg] = pos;
					}
				}
			}
		}
		other = other->list_next;
	}

	/* register hinting */
	reg = IR_REG_NONE;
	if (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS)) {
		reg = ir_get_preferred_reg(ctx, ival, available);
	}
	if (reg == IR_REG_NONE) {
select_register:
		reg = IR_REGSET_FIRST(available);
	}

	/* reg = register with highest nextUsePos */
	pos = nextUsePos[reg];
	tmp_regset = available;
	IR_REGSET_EXCL(tmp_regset, reg);
	IR_REGSET_FOREACH(tmp_regset, i) {
		if (nextUsePos[i] > pos) {
			pos = nextUsePos[i];
			reg = i;
		}
	} IR_REGSET_FOREACH_END();

	/* if first usage of current is after nextUsePos[reg] then */
	if (next_use_pos > pos && !(ival->flags & IR_LIVE_INTERVAL_TEMP)) {
		/* all other intervals are used before current, so it is best to spill current itself */
		/* assign spill slot to current */
		/* split current before its first use position that requires a register */
		ir_live_pos split_pos;

spill_current:
		if (next_use_pos == ival->range.start) {
			IR_ASSERT(ival->use_pos && ival->use_pos->op_num == 0);
			/* split right after definition */
			split_pos = next_use_pos + 1;
		} else {
			split_pos = ir_find_optimal_split_position(ctx, ival, ival->range.start, next_use_pos - 1, 1);
		}

		if (split_pos > ival->range.start) {
			IR_LOG_LSRA("    ---- Conflict with others", ival, " (all others are used before)");
			other = ir_split_interval_at(ctx, ival, split_pos);
			IR_LOG_LSRA("    ---- Spill", ival, "");
			ir_add_to_unhandled(unhandled, other);
			IR_LOG_LSRA("    ---- Queue", other, "");
			return IR_REG_NONE;
		}
	}

	if (ival->end > blockPos[reg]) {
		/* spilling make a register free only for the first part of current */
		IR_LOG_LSRA("    ---- Conflict with others", ival, " (spilling make a register free only for the first part)");
		/* split current at optimal position before block_pos[reg] */
		ir_live_pos split_pos = ir_last_use_pos_before(ival,  blockPos[reg] + 1,
			IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG);
		if (split_pos == 0) {
			split_pos = ir_first_use_pos_after(ival, blockPos[reg],
				IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG) - 1;
			other = ir_split_interval_at(ctx, ival, split_pos);
			ir_add_to_unhandled(unhandled, other);
			IR_LOG_LSRA("      ---- Queue", other, "");
			return IR_REG_NONE;
		}
		if (split_pos >= blockPos[reg]) {
try_next_available_register:
			IR_REGSET_EXCL(available, reg);
			if (IR_REGSET_IS_EMPTY(available)) {
				fprintf(stderr, "LSRA Internal Error: Unsolvable conflict. Allocation is not possible\n");
				IR_ASSERT(0);
				exit(-1);
			}
			IR_LOG_LSRA("      ---- Restart", ival, "");
			goto select_register;
		}
		split_pos = ir_find_optimal_split_position(ctx, ival, split_pos, blockPos[reg], 1);
		other = ir_split_interval_at(ctx, ival, split_pos);
		ir_add_to_unhandled(unhandled, other);
		IR_LOG_LSRA("      ---- Queue", other, "");
	}

	/* spill intervals that currently block reg */
	prev = NULL;
	other = *active;
	while (other) {
		ir_live_pos split_pos;

		if (reg == other->reg) {
			/* split active interval for reg at position */
			ir_live_pos overlap = ir_ivals_overlap(&ival->range, other->current_range);

			if (overlap) {
				ir_live_interval *child, *child2;

				IR_ASSERT(other->type != IR_VOID);
				IR_LOG_LSRA_CONFLICT("      ---- Conflict with active", other, overlap);

				split_pos = ir_last_use_pos_before(other, ival->range.start, IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG);
				if (split_pos == 0) {
					split_pos = ival->range.start;
				}
				split_pos = ir_find_optimal_split_position(ctx, other, split_pos, ival->range.start, 1);
				if (split_pos > other->range.start) {
					child = ir_split_interval_at(ctx, other, split_pos);
					if (prev) {
						prev->list_next = other->list_next;
					} else {
						*active = other->list_next;
					}
					IR_LOG_LSRA("      ---- Finish", other, "");
				} else {
					if (ir_first_use_pos(other, IR_USE_MUST_BE_IN_REG) <= other->end) {
						if (!(ival->flags & IR_LIVE_INTERVAL_TEMP)) {
							next_use_pos = ir_first_use_pos(ival, IR_USE_MUST_BE_IN_REG);
							if (next_use_pos == ival->range.start) {
								IR_ASSERT(ival->use_pos && ival->use_pos->op_num == 0);
								/* split right after definition */
								split_pos = next_use_pos + 1;
							} else {
								split_pos = ir_find_optimal_split_position(ctx, ival, ival->range.start, next_use_pos - 1, 1);
							}

							if (split_pos > ival->range.start) {
								goto spill_current;
							}
						}
						goto try_next_available_register;
					}
					child = other;
					other->reg = IR_REG_NONE;
					if (prev) {
						prev->list_next = other->list_next;
					} else {
						*active = other->list_next;
					}
					IR_LOG_LSRA("      ---- Spill and Finish", other, " (it must not be in reg)");
				}

				split_pos = ir_first_use_pos_after(child, ival->range.start, IR_USE_MUST_BE_IN_REG | IR_USE_SHOULD_BE_IN_REG) - 1; // TODO: ???
				if (split_pos > child->range.start && split_pos < child->end) {
					ir_live_pos opt_split_pos = ir_find_optimal_split_position(ctx, child, ival->range.start, split_pos, 1);
					if (opt_split_pos > child->range.start) {
						split_pos = opt_split_pos;
					}
					child2 = ir_split_interval_at(ctx, child, split_pos);
					IR_LOG_LSRA("      ---- Spill", child, "");
					ir_add_to_unhandled(unhandled, child2);
					IR_LOG_LSRA("      ---- Queue", child2, "");
				} else if (child != other) {
					// TODO: this may cause endless loop
					ir_add_to_unhandled(unhandled, child);
					IR_LOG_LSRA("      ---- Queue", child, "");
				}
			}
			break;
		}
		prev = other;
		other = other->list_next;
	}

	/* split any inactive interval for reg at the end of its lifetime hole */
	other = *inactive;
	while (other) {
		/* freeUntilPos[it.reg] = next intersection of it with current */
		if (reg == other->reg) {
			ir_live_pos overlap = ir_ivals_overlap(&ival->range, other->current_range);

			if (overlap) {
				ir_live_interval *child;

				IR_ASSERT(other->type != IR_VOID);
				IR_LOG_LSRA_CONFLICT("      ---- Conflict with inactive", other, overlap);
				// TODO: optimal split position (this case is not tested)
				child = ir_split_interval_at(ctx, other, overlap);
				/* reset range cache */
				other->current_range = &other->range;
				ir_add_to_unhandled(unhandled, child);
				IR_LOG_LSRA("      ---- Queue", child, "");
			}
		}
		other = other->list_next;
	}

	/* current.reg = reg */
	ival->reg = reg;
	IR_LOG_LSRA_ASSIGN("    ---- Assign", ival, " (after splitting others)");

	if (*unhandled && ival->end > (*unhandled)->range.start) {
		ival->list_next = *active;
		*active = ival;
	}
	return reg;
}

static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
{
	ir_block *bb = ctx->data;
	ir_tmp_reg tmp_reg;

	if (to == 0) {
		if (IR_IS_TYPE_INT(type)) {
			tmp_reg.num = 0;
			tmp_reg.type = type;
			tmp_reg.start = IR_USE_SUB_REF;
			tmp_reg.end = IR_SAVE_SUB_REF;
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			tmp_reg.num = 1;
			tmp_reg.type = type;
			tmp_reg.start = IR_USE_SUB_REF;
			tmp_reg.end = IR_SAVE_SUB_REF;
		}
	} else if (from != 0) {
		if (IR_IS_TYPE_INT(type)) {
			tmp_reg.num = 0;
			tmp_reg.type = type;
			tmp_reg.start = IR_USE_SUB_REF;
			tmp_reg.end = IR_SAVE_SUB_REF;
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			tmp_reg.num = 1;
			tmp_reg.type = type;
			tmp_reg.start = IR_USE_SUB_REF;
			tmp_reg.end = IR_SAVE_SUB_REF;
		}
	} else {
		return 1;
	}
	if (!ir_has_tmp(ctx, bb->end, tmp_reg.num)) {
		ir_add_tmp(ctx, bb->end, bb->end, tmp_reg.num, tmp_reg);
	}
	return 1;
}

static bool ir_ival_spill_for_fuse_load(ir_ctx *ctx, ir_live_interval *ival, ir_reg_alloc_data *data)
{
	ir_use_pos *use_pos = ival->use_pos;
	ir_insn *insn;

	if (ival->flags & IR_LIVE_INTERVAL_MEM_PARAM) {
		IR_ASSERT(!ival->next && use_pos && use_pos->op_num == 0);
		insn = &ctx->ir_base[IR_LIVE_POS_TO_REF(use_pos->pos)];
		IR_ASSERT(insn->op == IR_PARAM);
		use_pos = use_pos->next;
		if (use_pos && (use_pos->next || (use_pos->flags & IR_USE_MUST_BE_IN_REG))) {
			return 0;
		}

		if (use_pos) {
			ir_block *bb = ir_block_from_live_pos(ctx, use_pos->pos);
			if (bb->loop_depth) {
				return 0;
			}
		}

		return 1;
	} else if (ival->flags & IR_LIVE_INTERVAL_MEM_LOAD) {
		insn = &ctx->ir_base[IR_LIVE_POS_TO_REF(use_pos->pos)];
		IR_ASSERT(insn->op == IR_VLOAD);
		IR_ASSERT(ctx->ir_base[insn->op2].op == IR_VAR);
		use_pos = use_pos->next;
		if (use_pos && (use_pos->next || (use_pos->flags & IR_USE_MUST_BE_IN_REG))) {
			return 0;
		}

		if (use_pos) {
			ir_block *bb = ir_block_from_live_pos(ctx, use_pos->pos);
			if (bb->loop_depth && bb != ir_block_from_live_pos(ctx, ival->use_pos->pos)) {
				return 0;
			}
			/* check if VAR may be clobbered between VLOAD and use */
			ir_use_list *use_list = &ctx->use_lists[insn->op2];
			ir_ref n = use_list->count;
			ir_ref *p = &ctx->use_edges[use_list->refs];
			for (; n > 0; p++, n--) {
				ir_ref use = *p;
				if (ctx->ir_base[use].op == IR_VSTORE) {
					if (use > IR_LIVE_POS_TO_REF(ival->use_pos->pos) && use < IR_LIVE_POS_TO_REF(use_pos->pos)) {
						return 0;
					}
				} else if (ctx->ir_base[use].op == IR_VADDR) {
					return 0;
				}
			}
		}
		ival->stack_spill_pos = ctx->ir_base[insn->op2].op3;

		return 1;
	}
	return 0;
}

static void ir_assign_bound_spill_slots(ir_ctx *ctx)
{
	ir_hashtab_bucket *b = ctx->binding->data;
	uint32_t n = ctx->binding->count;
	uint32_t v;
	ir_live_interval *ival;

	while (n > 0) {
		v = ctx->vregs[b->key];
		if (v) {
			ival = ctx->live_intervals[v];
			if (ival
			 && ival->stack_spill_pos == -1
			 && (ival->next || ival->reg == IR_REG_NONE)) {
				IR_ASSERT(b->val < 0);
				/* special spill slot */
				ival->stack_spill_pos = -b->val;
				ival->flags |= IR_LIVE_INTERVAL_SPILLED | IR_LIVE_INTERVAL_SPILL_SPECIAL;
			}
		}
		b++;
		n--;
	}
}

static int ir_linear_scan(ir_ctx *ctx)
{
	uint32_t b;
	ir_block *bb;
	ir_live_interval *unhandled = NULL;
	ir_live_interval *active = NULL;
	ir_live_interval *inactive = NULL;
	ir_live_interval *ival, *other, *prev;
	int j;
	ir_live_pos position;
	ir_reg reg;
	ir_reg_alloc_data data;
	ir_ref vars = ctx->vars;

	if (!ctx->live_intervals) {
		return 0;
	}

	if (ctx->flags2 & IR_LR_HAVE_DESSA_MOVES) {
		/* Add fixed intervals for temporary registers used for DESSA moves */
		for (b = 1, bb = &ctx->cfg_blocks[1]; b <= ctx->cfg_blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			if (bb->flags & IR_BB_DESSA_MOVES) {
				ctx->data = bb;
				ir_gen_dessa_moves(ctx, b, ir_fix_dessa_tmps);
			}
		}
	}

	ctx->data = &data;
	ctx->stack_frame_size = 0;
	data.unused_slot_4 = 0;
	data.unused_slot_2 = 0;
	data.unused_slot_1 = 0;
	data.handled = NULL;

	while (vars) {
		ir_ref var = vars;
		ir_insn *insn = &ctx->ir_base[var];

		IR_ASSERT(insn->op == IR_VAR || insn->op == IR_ALLOCA);
		vars = insn->op3; /* list next */

		if (insn->op == IR_VAR) {
			ir_ref slot = ir_allocate_spill_slot(ctx, insn->type, &data);;
			ir_use_list *use_list;
			ir_ref n, *p;

			insn->op3 = slot;
			use_list = &ctx->use_lists[var];
			n = use_list->count;
			p = &ctx->use_edges[use_list->refs];
			for (; n > 0; p++, n--) {
				insn = &ctx->ir_base[*p];
				if (insn->op == IR_VADDR) {
					insn->op3 = slot;
				}
			}
		} else {
			ir_insn *val = &ctx->ir_base[insn->op2];

			IR_ASSERT(IR_IS_CONST_REF(insn->op2));
			IR_ASSERT(IR_IS_TYPE_INT(val->type));
			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
			IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 >= 0);
			IR_ASSERT(val->val.i64 < 0x7fffffff);

			insn->op3 = ir_allocate_big_spill_slot(ctx, val->val.i32, &data);
		}
	}

	for (j = ctx->vregs_count; j != 0; j--) {
		ival = ctx->live_intervals[j];
		if (ival) {
			if (!(ival->flags & (IR_LIVE_INTERVAL_MEM_PARAM|IR_LIVE_INTERVAL_MEM_LOAD))
					|| !ir_ival_spill_for_fuse_load(ctx, ival, &data)) {
				ir_add_to_unhandled(&unhandled, ival);
			}
		}
	}

	ival = ctx->live_intervals[0];
	if (ival) {
		ir_merge_to_unhandled(&unhandled, ival);
	}

	/* vregs + tmp + fixed + SRATCH + ALL */
	for (j = ctx->vregs_count + 1; j <= ctx->vregs_count + IR_REG_NUM + 2; j++) {
		ival = ctx->live_intervals[j];
		if (ival) {
			ival->current_range = &ival->range;
			ival->list_next = inactive;
			inactive = ival;
		}
	}

	ctx->flags2 &= ~(IR_RA_HAVE_SPLITS|IR_RA_HAVE_SPILLS);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_RA) {
		fprintf(stderr, "----\n");
		ir_dump_live_ranges(ctx, stderr);
		fprintf(stderr, "---- Start LSRA\n");
	}
#endif

	while (unhandled) {
		ival = unhandled;
		ival->current_range = &ival->range;
		unhandled = ival->list_next;
		position = ival->range.start;

		IR_LOG_LSRA("  ---- Processing", ival, "...");

		/* for each interval i in active */
		other = active;
		prev = NULL;
		while (other) {
			ir_live_range *r = other->current_range;

			IR_ASSERT(r);
			if (r->end <= position) {
				do {
					r = r->next;
				} while (r && r->end <= position);
				if (!r) {
					/* move i from active to handled */
					other = other->list_next;
					if (prev) {
						prev->list_next = other;
					} else {
						active = other;
					}
					continue;
				}
				other->current_range = r;
			}
			if (position < r->start) {
				/* move i from active to inactive */
				if (prev) {
					prev->list_next = other->list_next;
				} else {
					active = other->list_next;
				}
				other->list_next = inactive;
				inactive = other;
			} else {
				prev = other;
			}
			other = prev ? prev->list_next : active;
		}

		/* for each interval i in inactive */
		other = inactive;
		prev = NULL;
		while (other) {
			ir_live_range *r = other->current_range;

			IR_ASSERT(r);
			if (r->end <= position) {
				do {
					r = r->next;
				} while (r && r->end <= position);
				if (!r) {
					/* move i from inactive to handled */
					other = other->list_next;
					if (prev) {
						prev->list_next = other;
					} else {
						inactive = other;
					}
					continue;
				}
				other->current_range = r;
			}
			if (position >= r->start) {
				/* move i from inactive to active */
				if (prev) {
					prev->list_next = other->list_next;
				} else {
					inactive = other->list_next;
				}
				other->list_next = active;
				active = other;
			} else {
				prev = other;
			}
			other = prev ? prev->list_next : inactive;
		}

		reg = ir_try_allocate_free_reg(ctx, ival, &active, inactive, &unhandled);
		if (reg == IR_REG_NONE) {
			reg = ir_allocate_blocked_reg(ctx, ival, &active, &inactive, &unhandled);
		}
	}

#if 0 //def IR_DEBUG
	/* all intervals must be processed */
	ival = active;
	while (ival) {
		IR_ASSERT(!ival->next);
		ival = ival->list_next;
	}
	ival = inactive;
	while (ival) {
		IR_ASSERT(!ival->next);
		ival = ival->list_next;
	}
#endif

	if (ctx->flags2 & (IR_RA_HAVE_SPLITS|IR_RA_HAVE_SPILLS)) {

		if (ctx->binding) {
			ir_assign_bound_spill_slots(ctx);
		}

		/* Use simple linear-scan (without holes) to allocate and reuse spill slots */
		unhandled = NULL;
		for (j = ctx->vregs_count; j != 0; j--) {
			ival = ctx->live_intervals[j];
			if (ival
			 && (ival->next || ival->reg == IR_REG_NONE)
			 && ival->stack_spill_pos == -1) {
				ival->flags |= IR_LIVE_INTERVAL_SPILLED;
				if (!(ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)) {
					ir_live_range *r;

					other = ival;
					while (other->next) {
						other = other->next;
					}
					r = &other->range;
					while (r->next) {
						r = r->next;
					}
					ival->end = r->end;
					ir_add_to_unhandled_spill(&unhandled, ival);
				}
			}
		}

		if (unhandled) {
			uint8_t size;
			ir_live_interval *handled[9] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
			ir_live_interval *old;

			data.handled = handled;
			active = NULL;
			while (unhandled) {
				ival = unhandled;
				ival->current_range = &ival->range;
				unhandled = ival->list_next;
				position = ival->range.start;

				/* for each interval i in active */
				other = active;
				prev = NULL;
				while (other) {
					if (other->end <= position) {
						/* move i from active to handled */
						if (prev) {
							prev->list_next = other->list_next;
						} else {
							active = other->list_next;
						}
						size = ir_type_size[other->type];
						IR_ASSERT(size == 1 || size == 2 || size == 4 || size == 8);
						old = handled[size];
						while (old) {
							if (old->stack_spill_pos == other->stack_spill_pos) {
								break;
							}
							old = old->list_next;
						}
						if (!old) {
							other->list_next = handled[size];
							handled[size] = other;
						}
					} else {
						prev = other;
					}
					other = prev ? prev->list_next : active;
				}

				ival->stack_spill_pos = ir_allocate_spill_slot(ctx, ival->type, &data);
				if (unhandled && ival->end > unhandled->range.start) {
					ival->list_next = active;
					active = ival;
				} else {
					size = ir_type_size[ival->type];
					IR_ASSERT(size == 1 || size == 2 || size == 4 || size == 8);
					old = handled[size];
					while (old) {
						if (old->stack_spill_pos == ival->stack_spill_pos) {
							break;
						}
						old = old->list_next;
					}
					if (!old) {
						ival->list_next = handled[size];
						handled[size] = ival;
					}
				}
			}
			data.handled = NULL;
		}
	}

#ifdef IR_TARGET_X86
	if (ctx->flags2 & IR_HAS_FP_RET_SLOT) {
		ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data);
	} else if (ctx->ret_type == IR_FLOAT || ctx->ret_type == IR_DOUBLE) {
		ctx->ret_slot = ir_allocate_spill_slot(ctx, ctx->ret_type, &data);
	} else {
		ctx->ret_slot = -1;
	}
#endif

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_RA) {
		fprintf(stderr, "---- Finish LSRA\n");
		ir_dump_live_ranges(ctx, stderr);
		fprintf(stderr, "----\n");
	}
#endif

	return 1;
}

static bool needs_spill_reload(ir_ctx *ctx, ir_live_interval *ival, uint32_t b0, ir_bitset available)
{
    ir_worklist worklist;
	ir_block *bb;
	uint32_t b, *p, n;

	ir_worklist_init(&worklist, ctx->cfg_blocks_count + 1);
	ir_worklist_push(&worklist, b0);
	while (ir_worklist_len(&worklist) != 0) {
		b = ir_worklist_pop(&worklist);
        bb = &ctx->cfg_blocks[b];
		if (bb->flags & (IR_BB_ENTRY|IR_BB_START)) {
			ir_worklist_free(&worklist);
			return 1;
		}
		n = bb->predecessors_count;
		for (p = &ctx->cfg_edges[bb->predecessors]; n > 0; p++, n--) {
			b = *p;
			bb = &ctx->cfg_blocks[b];

			if (!ir_ival_covers(ival, IR_SAVE_LIVE_POS_FROM_REF(bb->end))) {
				ir_worklist_free(&worklist);
				return 1;
			} else if (!ir_bitset_in(available, b)) {
				ir_worklist_push(&worklist, b);
			}
		}
	}
	ir_worklist_free(&worklist);
	return 0;
}

static bool needs_spill_load(ir_ctx *ctx, ir_live_interval *ival, ir_use_pos *use_pos)
{
	if (use_pos->next
	 && use_pos->op_num == 1
	 && use_pos->next->pos == use_pos->pos
	 && !(use_pos->next->flags & IR_USE_MUST_BE_IN_REG)) {
		/* Support for R2 = ADD(R1, R1) */
		use_pos = use_pos->next;
	}
	return use_pos->next && use_pos->next->op_num != 0;
}

static void ir_set_fused_reg(ir_ctx *ctx, ir_ref root, ir_ref ref_and_op, int8_t reg)
{
	char key[10];

	IR_ASSERT(reg != IR_REG_NONE);
	if (!ctx->fused_regs) {
		ctx->fused_regs = ir_mem_malloc(sizeof(ir_strtab));
		ir_strtab_init(ctx->fused_regs, 8, 128);
	}
	memcpy(key, &root, sizeof(ir_ref));
	memcpy(key + 4, &ref_and_op, sizeof(ir_ref));
	ir_strtab_lookup(ctx->fused_regs, key, 8, 0x10000000 | reg);
}

static void assign_regs(ir_ctx *ctx)
{
	ir_ref i;
	ir_live_interval *ival, *top_ival;
	ir_use_pos *use_pos;
	int8_t reg, old_reg;
	ir_ref ref;
	ir_regset used_regs = 0;

	if (!ctx->regs) {
		ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
		memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
	}

	if (!(ctx->flags2 & (IR_RA_HAVE_SPLITS|IR_RA_HAVE_SPILLS))) {
		for (i = 1; i <= ctx->vregs_count; i++) {
			ival = ctx->live_intervals[i];
			if (ival) {
				do {
					if (ival->reg != IR_REG_NONE) {
						reg = ival->reg;
						IR_REGSET_INCL(used_regs, reg);
						use_pos = ival->use_pos;
						while (use_pos) {
							ref = (use_pos->hint_ref < 0) ? -use_pos->hint_ref : IR_LIVE_POS_TO_REF(use_pos->pos);
							ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);
							use_pos = use_pos->next;
						}
					}
					ival = ival->next;
				} while (ival);
			}
		}
	} else {
		ir_bitset available = ir_bitset_malloc(ctx->cfg_blocks_count + 1);

		for (i = 1; i <= ctx->vregs_count; i++) {
			top_ival = ival = ctx->live_intervals[i];
			if (ival) {
				if (!(ival->flags & IR_LIVE_INTERVAL_SPILLED)) {
					do {
						if (ival->reg != IR_REG_NONE) {
							IR_REGSET_INCL(used_regs, ival->reg);
							use_pos = ival->use_pos;
							while (use_pos) {
								reg = ival->reg;
								ref = IR_LIVE_POS_TO_REF(use_pos->pos);
								if (use_pos->hint_ref < 0) {
									ref = -use_pos->hint_ref;
								}
								ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);

								use_pos = use_pos->next;
							}
						}
						ival = ival->next;
					} while (ival);
				} else {
					do {
						if (ival->reg != IR_REG_NONE) {
							ir_ref prev_use_ref = IR_UNUSED;

							ir_bitset_clear(available, ir_bitset_len(ctx->cfg_blocks_count + 1));
							IR_REGSET_INCL(used_regs, ival->reg);
							use_pos = ival->use_pos;
							while (use_pos) {
								reg = ival->reg;
								ref = IR_LIVE_POS_TO_REF(use_pos->pos);
								// TODO: Insert spill loads and stores in optimal positions (resolution)
								if (use_pos->op_num == 0) {
									if ((ctx->ir_base[ref].op == IR_COPY
									  || ctx->ir_base[ref].op == IR_BITCAST
									  || ctx->ir_base[ref].op == IR_TRUNC)
									 && !IR_IS_CONST_REF(ctx->ir_base[ref].op1)
									 && ctx->vregs[ctx->ir_base[ref].op1] == (uint32_t)i) {
										/* register reuse */
										ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);
										prev_use_ref = ref;
										use_pos = use_pos->next;
										continue;
									}
									ir_bitset_clear(available, ir_bitset_len(ctx->cfg_blocks_count + 1));
									if (ctx->ir_base[ref].op == IR_PHI) {
										/* Spilled PHI var is passed through memory */
										reg = IR_REG_NONE;
										prev_use_ref = IR_UNUSED;
									} else if (ctx->ir_base[ref].op == IR_PARAM
									 && (ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)) {
										/* Stack PARAM var is passed through memory */
										reg = IR_REG_NONE;
									} else {
										uint32_t use_b = ctx->cfg_map[ref];

										if (ir_ival_covers(ival, IR_SAVE_LIVE_POS_FROM_REF(ctx->cfg_blocks[use_b].end))) {
											ir_bitset_incl(available, use_b);
										}
										if (top_ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
											reg |= IR_REG_SPILL_SPECIAL;
										} else {
											reg |= IR_REG_SPILL_STORE;
										}
										prev_use_ref = ref;
									}
								} else if ((!prev_use_ref || ctx->cfg_map[prev_use_ref] != ctx->cfg_map[ref])
								 && needs_spill_reload(ctx, ival, ctx->cfg_map[ref], available)) {
									if (!(use_pos->flags & IR_USE_MUST_BE_IN_REG)
									 && use_pos->hint != reg
//									 && ctx->ir_base[ref].op != IR_CALL
//									 && ctx->ir_base[ref].op != IR_TAILCALL) {
									 && ctx->ir_base[ref].op != IR_SNAPSHOT
									 && !needs_spill_load(ctx, ival, use_pos)) {
										/* fuse spill load (valid only when register is not reused) */
										reg = IR_REG_NONE;
										if (use_pos->next
										 && use_pos->op_num == 1
										 && use_pos->next->pos == use_pos->pos
										 && !(use_pos->next->flags & IR_USE_MUST_BE_IN_REG)) {
											/* Support for R2 = BINOP(R1, R1) */
											if (use_pos->hint_ref < 0) {
												ref = -use_pos->hint_ref;
											}
											ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);
											use_pos = use_pos->next;
										}
									} else {
										if (top_ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
											reg |= IR_REG_SPILL_SPECIAL;
										} else {
											reg |= IR_REG_SPILL_LOAD;
										}
										if (ctx->ir_base[ref].op != IR_SNAPSHOT && !(use_pos->flags & IR_PHI_USE)) {
											uint32_t use_b = ctx->cfg_map[ref];

											if (ir_ival_covers(ival, IR_SAVE_LIVE_POS_FROM_REF(ctx->cfg_blocks[use_b].end))) {
												ir_bitset_incl(available, use_b);
											}
											prev_use_ref = ref;
										}
									}
									if (use_pos->hint_ref < 0
									 && (old_reg = ir_get_alocated_reg(ctx, -use_pos->hint_ref, use_pos->op_num)) != IR_REG_NONE) {
										if (top_ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
											reg |= IR_REG_SPILL_SPECIAL;
										} else {
											reg |= IR_REG_SPILL_LOAD;
										}
										if (reg != old_reg) {
											IR_ASSERT(ctx->rules[-use_pos->hint_ref] & IR_FUSED);
											ctx->rules[-use_pos->hint_ref] |= IR_FUSED_REG;
											ir_set_fused_reg(ctx, ref, -use_pos->hint_ref * sizeof(ir_ref) + use_pos->op_num, reg);
											use_pos = use_pos->next;
											continue;
										}
									}
								} else if (use_pos->flags & IR_PHI_USE) {
									IR_ASSERT(use_pos->hint_ref < 0);
									IR_ASSERT(ctx->vregs[-use_pos->hint_ref]);
									IR_ASSERT(ctx->live_intervals[ctx->vregs[-use_pos->hint_ref]]);
									if (ctx->live_intervals[ctx->vregs[-use_pos->hint_ref]]->flags & IR_LIVE_INTERVAL_SPILLED) {
										/* Spilled PHI var is passed through memory */
										reg = IR_REG_NONE;
									}
								} else if (use_pos->hint_ref < 0
										&& (old_reg = ir_get_alocated_reg(ctx, -use_pos->hint_ref, use_pos->op_num)) != IR_REG_NONE) {
									if (reg != old_reg) {
										IR_ASSERT(ctx->rules[-use_pos->hint_ref] & IR_FUSED);
										ctx->rules[-use_pos->hint_ref] |= IR_FUSED_REG;
										ir_set_fused_reg(ctx, ref, -use_pos->hint_ref * sizeof(ir_ref) + use_pos->op_num, reg);
										use_pos = use_pos->next;
										continue;
									}
								} else {
									/* reuse register without spill load */
								}
								if (use_pos->hint_ref < 0) {
									ref = -use_pos->hint_ref;
								}
								ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);

								use_pos = use_pos->next;
							}
						} else if (!(top_ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL)) {
							use_pos = ival->use_pos;
							while (use_pos) {
								ref = IR_LIVE_POS_TO_REF(use_pos->pos);
								if (ctx->ir_base[ref].op == IR_SNAPSHOT) {
									IR_ASSERT(use_pos->hint_ref >= 0);
									/* A reference to a CPU spill slot */
									reg = IR_REG_SPILL_STORE | IR_REG_STACK_POINTER;
									ir_set_alocated_reg(ctx, ref, use_pos->op_num, reg);
								}
								use_pos = use_pos->next;
							}
						}
						ival = ival->next;
					} while (ival);
				}
			}
		}
		ir_mem_free(available);
	}

	/* Temporary registers */
	ival = ctx->live_intervals[0];
	if (ival) {
		do {
			IR_ASSERT(ival->reg != IR_REG_NONE);
			IR_REGSET_INCL(used_regs, ival->reg);
			reg = ival->reg;
			if (ival->tmp_op_num > 0) {
				ir_insn *insn = &ctx->ir_base[ival->tmp_ref];

				if (ival->tmp_op_num <= insn->inputs_count) {
					ir_ref *ops = insn->ops;
					if (IR_IS_CONST_REF(ops[ival->tmp_op_num])) {
						/* constant rematerialization */
						reg |= IR_REG_SPILL_LOAD;
					} else if (ctx->ir_base[ops[ival->tmp_op_num]].op == IR_ALLOCA
							|| ctx->ir_base[ops[ival->tmp_op_num]].op == IR_VADDR) {
						/* local address rematerialization */
						reg |= IR_REG_SPILL_LOAD;
					}
				}
			}
			ir_set_alocated_reg(ctx, ival->tmp_ref, ival->tmp_op_num, reg);
			ival = ival->next;
		} while (ival);
	}

	if (ctx->fixed_stack_frame_size != -1) {
		ctx->used_preserved_regs = (ir_regset)ctx->fixed_save_regset;
		if (IR_REGSET_DIFFERENCE(IR_REGSET_INTERSECTION(used_regs, IR_REGSET_PRESERVED),
			ctx->used_preserved_regs)) {
			// TODO: Preserved reg and fixed frame conflict ???
			// IR_ASSERT(0 && "Preserved reg and fixed frame conflict");
		}
	} else {
		ctx->used_preserved_regs = IR_REGSET_UNION((ir_regset)ctx->fixed_save_regset,
			IR_REGSET_DIFFERENCE(IR_REGSET_INTERSECTION(used_regs, IR_REGSET_PRESERVED),
				(ctx->flags & IR_FUNCTION) ? (ir_regset)ctx->fixed_regset : IR_REGSET_PRESERVED));
	}

	ir_fix_stack_frame(ctx);
}

int ir_reg_alloc(ir_ctx *ctx)
{
	if (ir_linear_scan(ctx)) {
		assign_regs(ctx);
		return 1;
	}
	return 0;
}
