11 files changed, 1155 insertions, 0 deletions
diff --git a/mdk-stage1/dietlibc/sparc/Makefile.add b/mdk-stage1/dietlibc/sparc/Makefile.add
new file mode 100644
index 000000000..a12a0446a
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/Makefile.add
@@ -0,0 +1,3 @@
+
+CFLAGS+=-mcpu=supersparc -Os
+override VPATH=sparc:syscalls.s:lib
diff --git a/mdk-stage1/dietlibc/sparc/__longjmp.S b/mdk-stage1/dietlibc/sparc/__longjmp.S
new file mode 100644
index 000000000..81dd24af2
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/__longjmp.S
@@ -0,0 +1,66 @@
+#define _ASM
+#define _SETJMP_H
+#include <bits/setjmp.h>
+
+
+#define ENV(base,reg) [%base + (reg * 4)]	/* Word slot number reg of the buffer addressed by %base.  */
+#define ST_FLUSH_WINDOWS 3	/* Software trap number used with "ta" below.  */
+#define RW_FP [%fp + 0x48]	/* Stack word, relative to %fp, that receives the target FP below.  */
+
+.text
+.global __longjmp
+__longjmp:
+	/* Store our arguments in global registers so we can still
+	   use them while unwinding frames and their register windows.  */
+
+	ld ENV(o0,JB_FP), %g3   /* Cache target FP in register %g3.  */
+	mov %o0, %g1            /* ENV in %g1 */
+	orcc %o1, %g0, %g2      /* VAL in %g2 */
+	be,a 0f                 /* Branch if zero; else skip delay slot.  */
+	 mov 1, %g2             /* Delay slot only hit if zero: VAL = 1.  */
+0:
+	xor %fp, %g3, %o0       /* Bits in which the current and target FP differ.  */
+	add %fp, 512, %o1       /* %o1 = current FP + 512.  */
+	andncc %o0, 4095, %o0   /* Ignore the low 12 bits; Z set if both lie in one 4096-byte block.  */
+	bne .Lthread            /* Different block: take the flush-windows path.  */
+	 cmp %o1, %g3           /* Delay slot: compare FP + 512 with the target FP.  */
+	bl .Lthread             /* FP + 512 below target: flush path too (delay slot is the cmp at .Lloop).  */
+
+	/* Now we will loop, unwinding the register windows up the stack
+	   until the restored %fp value matches the target value in %g3.  */
+
+.Lloop:
+	cmp %fp, %g3            /* Have we reached the target frame? */
+	bl,a .Lloop             /* Loop while current fp is below target.  */
+	 restore                /* Unwind register window in delay slot.  */
+	be,a .Lfound            /* Better have hit it exactly.  */
+	 ld ENV(g1,JB_SP), %o0  /* Delay slot: extract target SP.  */
+
+.Lthread:
+	/*
+	 * Do a "flush register windows trap".  The trap handler in the
+	 * kernel writes all the register windows to their stack slots, and
+	 * marks them all as invalid (needing to be sucked up from the
+	 * stack when used).  This ensures that all information needed to
+	 * unwind to these callers is in memory, not in the register
+	 * windows.
+	 */
+	ta      ST_FLUSH_WINDOWS
+	ld      ENV(g1,JB_PC), %o7 /* Set return PC. */
+	ld      ENV(g1,JB_SP), %fp /* Set saved SP on restore below. */
+	sub     %fp, 64, %sp    /* Allocate a register frame. */
+	st      %g3, RW_FP      /* Set saved FP on restore below. */
+	retl
+	 restore %g2, 0, %o0    /* Restore values from above register frame. */
+
+.Lfound:
+	/* We have unwound register windows so %fp matches the target.  */
+	mov %o0, %sp            /* OK, install new SP.  */
+
+.Lsp_ok:
+	ld ENV(g1,JB_PC), %o0   /* Extract target return PC.  */
+	jmp %o0 + 8             /* Return there.  */
+	 mov %g2, %o0           /* Delay slot: set return value.  */
+
+.size __longjmp, . - __longjmp
+
diff --git a/mdk-stage1/dietlibc/sparc/fork.S b/mdk-stage1/dietlibc/sparc/fork.S
new file mode 100644
index 000000000..150839971
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/fork.S
@@ -0,0 +1,19 @@
+#include "syscalls.h"
+
+.text
+.global fork
+fork:					! leaf routine: no save, so results stay in the %o registers
+	mov	2, %g1			! %g1 = system call number (2 selects fork)
+	ta	0x10			! trap into the kernel
+	bcc,a	1f			! carry clear: success, go pick the return value
+	nop
+
+	sethi   %hi(errno), %o3		! failure: %o3 = address of errno
+	or      %o3, %lo(errno), %o3
+	st	%o0, [%o3]		! errno = error code from the trap (in %o0, not %i0)
+
+	retl
+	mov	-1, %o0			! delay slot: report failure as -1
+1:	dec	%o1			! expects %o1 = 1 in the child, 0 in the parent: mask 0 / all ones
+	retl
+	and	%o0, %o1, %o0		! delay slot: child gets 0, parent keeps the pid
diff --git a/mdk-stage1/dietlibc/sparc/mmap.c b/mdk-stage1/dietlibc/sparc/mmap.c
new file mode 100644
index 000000000..25ebdc24e
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/mmap.c
@@ -0,0 +1,43 @@
+#include <linux/types.h>
+#include <linux/unistd.h>
+
+#define __SYSCALL_STRING  /* asm text shared by the inline syscall */  \
+        "ta     0x10;"    /* trap into the kernel */                    \
+        "bcs    2f;"      /* carry set: failure, go to 2 */             \
+        " nop;"           /* branch delay slot */                       \
+        "1:"              /* common exit point */                       \
+        ".subsection 2;"  /* error path is kept out of line */          \
+        "2:"                                                            \
+        "save   %%sp, -192, %%sp;"  /* new window: error code in i0 */  \
+        "call   __errno_location;"                                      \
+        " nop;"                                                         \
+        "st     %%i0,[%%o0];"  /* store it where the call pointed */    \
+        "ba     1b;"                                                    \
+        " restore %%g0, -1, %%o0;"  /* delay slot: result becomes -1 */ \
+        ".previous;"
+
+#define __SYSCALL_CLOBBERS "g2", "g3", "g4", "g5", "g7",                \
+        "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",                 \
+        "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",           \
+        "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",         \
+        "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",         \
+        "cc", "memory"  /* clobber list for __SYSCALL_STRING: globals, all FP regs, condition codes, memory */
+
+#define inline_syscall6(name,arg1,arg2,arg3,arg4,arg5,arg6)             \
+({      /* six arguments pinned to o0-o5, call number to g1 */          \
+        register long __o0 __asm__ ("o0") = (long)(arg1);               \
+        register long __o1 __asm__ ("o1") = (long)(arg2);               \
+        register long __o2 __asm__ ("o2") = (long)(arg3);               \
+        register long __o3 __asm__ ("o3") = (long)(arg4);               \
+        register long __o4 __asm__ ("o4") = (long)(arg5);               \
+        register long __o5 __asm__ ("o5") = (long)(arg6);               \
+        register long __g1 __asm__ ("g1") = __NR_##name;                \
+        __asm__ __volatile__ (__SYSCALL_STRING : "=r" (__g1), "=r" (__o0) : \
+                 "0" (__g1), "1" (__o0), "r" (__o1), "r" (__o2),        \
+                 "r" (__o3), "r" (__o4), "r" (__o5) :                   \
+                 __SYSCALL_CLOBBERS);  /* volatile: the trap has side effects, never drop or merge it */ \
+        __o0;   /* value of the expression: trap result, or -1 */       \
+})
+int mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset) {
+  return inline_syscall6(mmap, start, length, prot, flags, fd, offset);  /* raw six-argument trap; on failure errno is stored and -1 comes back */
+}
diff --git a/mdk-stage1/dietlibc/sparc/pipe.S b/mdk-stage1/dietlibc/sparc/pipe.S
new file mode 100644
index 000000000..b8ad1d251
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/pipe.S
@@ -0,0 +1,20 @@
+#include "syscalls.h"
+
+.text
+.global pipe
+pipe:
+	mov	%o0, %o2		! keep the int[2] pointer: the trap result replaces %o0 and %o1
+	mov	__NR_pipe, %g1		! %g1 = system call number
+	ta	0x10			! trap into the kernel
+	bcc,a	.Lnoerror		! carry clear: success
+	st	%o0, [ %o2 ]		! annulled slot, runs only when the branch is taken: first descriptor
+	save	%sp, -96, %sp		! failure: new window, error code is now %i0
+	call	__errno_location
+	nop
+	st	%i0, [ %o0 ]		! store the error code where __errno_location pointed
+	ret
+	restore	%g0, -1, %o0		! delay slot: return -1
+.Lnoerror:
+	st	%o1, [ %o2 + 4 ]	! second descriptor
+	retl
+	mov	%g0, %o0		! delay slot: return 0
diff --git a/mdk-stage1/dietlibc/sparc/setjmp.S b/mdk-stage1/dietlibc/sparc/setjmp.S
new file mode 100644
index 000000000..be77af3c4
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/setjmp.S
@@ -0,0 +1,35 @@
+#define _ASM
+#define _SETJMP_H
+#include <bits/setjmp.h>
+
+#define ST_FLUSH_WINDOWS        0x03
+
+.section	.rodata
+.text
+.globl __setjmp
+__setjmp:
+        b       1f              /* Join the common code in __sigsetjmp.  */
+         set    0, %o1          /* Delay slot: second argument = 0.  */
+.size __setjmp,.-__setjmp
+
+.globl setjmp
+setjmp:
+        set     1, %o1          /* Second argument = 1, then fall through into __sigsetjmp.  */
+.size setjmp,.-setjmp
+
+.globl __sigsetjmp
+__sigsetjmp:
+1:
+        /* Save our PC, SP and FP.  Save the signal mask if requested with
+           a tail-call for simplicity; it always returns zero.  */
+        ta      ST_FLUSH_WINDOWS        /* Flush-windows trap before recording the frame.  */
+
+        st      %o7, [%o0 + (JB_PC * 4)]        /* Return address of our caller.  */
+        st      %sp, [%o0 + (JB_SP * 4)]
+        st      %fp, [%o0 + (JB_FP * 4)]
+
+        mov     %o7, %g1                /* The call below overwrites %o7; keep a copy.  */
+        call    __sigjmp_save
+         mov    %g1, %o7                /* Delay slot: put it back so __sigjmp_save returns to our caller.  */
+.size __sigsetjmp,.-__sigsetjmp
+
diff --git a/mdk-stage1/dietlibc/sparc/start.S b/mdk-stage1/dietlibc/sparc/start.S
new file mode 100644
index 000000000..e948aaddc
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/start.S
@@ -0,0 +1,46 @@
+#include "start.h"
+
+#ifdef __sparc__
+        .section ".text"
+        .align 4
+        .global _start
+        .type _start,@function
+_start:
+
+  /* Terminate the stack frame, and reserve space for functions to
+     drop their arguments.  */
+        mov     %g0, %fp
+        sub     %sp, 6*4, %sp
+
+  /* Extract the arguments and environment as encoded on the stack.  The
+     argument info starts after one register window (16 words) past the SP.  */
+        ld      [%sp+22*4], %o0         /* %o0 = argc (16 words + the 6 just reserved).  */
+        add     %sp, 23*4, %o1          /* %o1 = argv, the word after argc.  */
+	add	%o1, %o0, %o2		/* Four adds of argc: %o2 = argv + 4*argc ...  */
+	add	%o2, %o0, %o2
+	add	%o2, %o0, %o2
+	add	%o2, %o0, %o2
+	add	%o2, 4, %o2		/* ... plus one more word: start of the environment.  */
+
+	sethi   %hi(environ), %o3	/* %o3 = address of environ.  */
+	or      %o3, %lo(environ), %o3
+	st	%o2, [%o3]		/* environ = %o2; %o2 also stays set for main.  */
+
+  /* When starting a binary via the dynamic linker, %g1 contains the
+     address of the shared library termination function, which will be
+     registered with atexit().  If we are statically linked, this will
+     be NULL.  */
+
+  /* Let libc do the rest of the initialization, and call main.  */
+	call	main
+        mov     %g1, %o5                /* Delay slot: hand %g1 over in %o5.  */
+
+	b	exit			/* %o0 still holds what main returned.  */
+	mov %o0, %i0			/* Delay slot: copies it to %i0; %o0 itself is unchanged.  */
+
+  /* Die very horribly if exit returns.  */
+        unimp
+
+        .size _start, .-_start
+#endif
+
diff --git a/mdk-stage1/dietlibc/sparc/udiv.S b/mdk-stage1/dietlibc/sparc/udiv.S
new file mode 100644
index 000000000..87479e7bd
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/udiv.S
@@ -0,0 +1,363 @@
+#ifdef __sparc__
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (values substituted when this file was generated):
+ *  name      = .udiv	name of function to generate
+ *  operation = div	div => %o0 / %o1; rem => %o0 % %o1
+ *  signed    = false	true => signed; false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+#define C_LABEL(name) name:	/* Turns name into a label definition.  */
+
+#define C_SYMBOL_NAME(name) name	/* Symbol name as the assembler sees it (unchanged).  */
+
+#define ENTRY(name) \
+        .global C_SYMBOL_NAME(name); \
+        .align 4;\
+        C_LABEL(name);\
+        .type name,@function;	/* Exported, 4-byte aligned function entry point.  */
+
+#define LOC(name)  . ## L ## name	/* Pastes ".L" in front of name to form a label.  */
+
+#define END(name) \
+        .size name, . - name	/* Records the size of the function started by ENTRY.  */
+
+#define ST_DIV0                 0x02	/* Software trap number raised below for a zero divisor.  */
+
+ENTRY(.udiv)
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! %o5 = divisor (comparand V); Z set if it is zero
+	bne	1f
+	mov	%o0, %o3		! delay slot: %o3 = dividend (remainder R)
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2			! delay slot: quotient Q = 0
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4			! delay slot: iteration count ITER = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2
+
+
+LOC(got_result):
+
+	retl
+	mov %o2, %o0		! delay slot: return the quotient
+
+END(.udiv)
+
+#endif
diff --git a/mdk-stage1/dietlibc/sparc/umul.S b/mdk-stage1/dietlibc/sparc/umul.S
new file mode 100644
index 000000000..15038ab2a
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/umul.S
@@ -0,0 +1,170 @@
+#ifdef __sparc__
+/*
+ * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.  Short
+ * multiplies require 25 instruction cycles, and long ones require
+ * 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ *	call	.umul
+ *	nop
+ *	bnz	overflow	(or tnz)
+ */
+
+#define C_LABEL(name) name:	/* Turns name into a label definition.  */
+
+#define C_SYMBOL_NAME(name) name	/* Symbol name as the assembler sees it (unchanged).  */
+
+#define ENTRY(name) \
+        .global C_SYMBOL_NAME(name); \
+        .align 4;\
+        C_LABEL(name);\
+        .type name,@function;	/* Exported, 4-byte aligned function entry point.  */
+
+#define LOC(name)  . ## L ## name	/* Pastes ".L" in front of name to form a label.  */
+
+#define END(name) \
+        .size name, . - name	/* Records the size of the function started by ENTRY.  */
+
+ENTRY(.umul)
+	or	%o0, %o1, %o4		! OR of both operands, so one test covers both
+	mov	%o0, %y			! multiplier -> Y
+	andncc	%o4, 0xfff, %g0		! test bits 12..31 of *both* args
+	be	LOC(mul_shortway)	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4		! zero the partial product; clear N & V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * Normally, with the shift-and-add approach, if both numbers are
+	 * positive you get the correct result.  With 32-bit two's-complement
+	 * numbers, -x is represented as
+	 *
+	 *		  x		    32
+	 *	( 2  -  ------ ) mod 2  *  2
+	 *		   32
+	 *		  2
+	 *
+	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
+	 * we can treat this as if the radix point were just to the left
+	 * of the sign bit (multiply by 2^32), and get
+	 *
+	 *	-x  =  (2 - x) mod 2
+	 *
+	 * Then, ignoring the `mod 2's for convenience:
+	 *
+	 *   x *  y	= xy
+	 *  -x *  y	= 2y - xy
+	 *   x * -y	= 2x - xy
+	 *  -x * -y	= 4 - 2x - 2y + xy
+	 *
+	 * For signed multiplies, we subtract (x << 32) from the partial
+	 * product to fix this problem for negative multipliers (see mul.s).
+	 * Because of the way the shift into the partial product is calculated
+	 * (N xor V), this term is automatically removed for the multiplicand,
+	 * so we don't have to adjust.
+	 *
+	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
+	 * and the correction is wrong.  So for unsigned multiplies where the
+	 * high order bit is one, we end up with xy - (y << 32).  To fix it
+	 * we add y << 32.
+	 */
+#if 0
+	tst	%o1
+	bl,a	1f		! if %o1 < 0 (high order bit = 1),
+	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
+1:	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
+#else
+	/* Faster code from tege@sics.se.  */
+	sra	%o1, 31, %o2	! make mask from sign bit
+	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
+	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place
+#endif
+
+LOC(mul_shortway):
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above),
+	 * and overflow is impossible (the answer is at most 24 bits long).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * %o4 has 20 of the bits that should be in the result; %y has
+	 * the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *	   -----result-----
+	 *
+	 * The 12 bits of %o4 left of the `result' area are all zero;
+	 * in fact, all top 20 bits of %o4 are zero.
+	 */
+
+	rd	%y, %o5		! fetch the low part left in Y
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20
+	or	%o5, %o0, %o0	! merge both pieces into the 32-bit result
+	retl
+	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
+
+END(.umul)
+#endif
diff --git a/mdk-stage1/dietlibc/sparc/unified.S b/mdk-stage1/dietlibc/sparc/unified.S
new file mode 100644
index 000000000..53b1612a9
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/unified.S
@@ -0,0 +1,28 @@
+#include <dietfeatures.h>
+
+.text
+.global __unified_syscall
+__unified_syscall:			! does not load %g1: the call number must already be set on entry
+	ta	0x10			! trap into the kernel, result comes back in %o0
+
+	bcc	1f			! carry clear: no error flagged by the trap
+	save	%sp, -104, %sp		! delay slot (always runs): new window, result is now %i0
+
+	neg	%i0, %i0		! carry set: turn the error code into its negative
+1:
+	add	%i0, 0xff, %l2		! maps results -255..-1 onto 0..0xfe
+	cmp	%l2, 0xfe
+	bgu	2f			! outside that range: ordinary result, return it as is
+	neg	%i0, %l3		! delay slot: %l3 = positive error number
+#ifdef WANT_THREAD_SAVE
+	call	__errno_location
+	nop
+#else
+	sethi	%hi(errno), %o0
+	or	%o0, %lo(errno), %o0
+#endif
+	st	%l3, [ %o0 ]		! record the error number
+	mov	-1, %i0			! return -1: must be %i0, since restore hands our %i0 to the caller
+2:
+	ret
+	restore				! delay slot: caller sees our %i0 as its %o0
diff --git a/mdk-stage1/dietlibc/sparc/urem.S b/mdk-stage1/dietlibc/sparc/urem.S
new file mode 100644
index 000000000..943cb7873
--- /dev/null
+++ b/mdk-stage1/dietlibc/sparc/urem.S
@@ -0,0 +1,362 @@
+#ifdef __sparc__
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (values substituted when this file was generated):
+ *  name      = .urem	name of function to generate
+ *  operation = rem	div => %o0 / %o1; rem => %o0 % %o1
+ *  signed    = false	true => signed; false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+#define C_LABEL(name) name:	/* Turns name into a label definition.  */
+
+#define C_SYMBOL_NAME(name) name	/* Symbol name as the assembler sees it (unchanged).  */
+
+#define ENTRY(name) \
+        .global C_SYMBOL_NAME(name); \
+        .align 4;\
+        C_LABEL(name);\
+        .type name,@function;	/* Exported, 4-byte aligned function entry point.  */
+
+#define LOC(name)  . ## L ## name	/* Pastes ".L" in front of name to form a label.  */
+
+#define END(name) \
+        .size name, . - name	/* Records the size of the function started by ENTRY.  */
+
+#define ST_DIV0                 0x02	/* Software trap number raised below for a zero divisor.  */
+
+ENTRY(.urem)
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! %o5 = divisor (comparand V); Z set if it is zero
+	bne	1f
+	mov	%o0, %o3		! delay slot: %o3 = dividend (remainder R)
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2			! delay slot: quotient Q = 0
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4			! delay slot: iteration count ITER = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3		! annulled slot: add the divisor back to a negative remainder
+
+
+LOC(got_result):
+
+	retl
+	mov %o3, %o0		! delay slot: return the remainder
+
+END(.urem)
+#endif