On pre-G3/G4 machines it was often a good idea to strategically convert C to asm. But I would never 'write from scratch' in asm; I'd write the code in C first, disassemble it and see how good or bad it was. VERY often just changing the C code a bit was enough to get the optimiser to do the 'right thing' -- but even if not, I'd start with the original disassembly, convert it to an asm() statement, and tweak it from there, making sure I kept the C code as the 'master'. That last habit saved my ass many times over as processors changed over time.
On the G3, and especially the G4/G5, it became a lot 'harder' to write assembly that was faster than C, mostly because of the pipeline and the RAM latency. It was still a good idea if you wanted to do stuff like AltiVec etc., but even in scalar code I'd often apply the same methods as before; I just had to benchmark a hell of a lot more, as something that 'looked' faster wasn't necessarily so.
Here's a piece of a PCI audio driver I wrote all the way back then, for OS X. It converts audio back and forth between (little-endian) 24-bit integer and floating point, 8 channels at a time. All the tricks in the book were used here, but as you can see, very little 'assembly'. The important part was in fact writing a (Perl) script to find the 'right' meshing of operations to keep the pipeline happy.
/*
 * Execute the PowerPC `fctiw` instruction (floating convert to integer word)
 * on B and return the contents of the destination floating-point register,
 * typed as a double.
 *
 * NOTE(review): per the PowerPC ISA, fctiw saturates out-of-range inputs and
 * leaves a 32-bit integer in the low word of the destination register, so the
 * returned "double" would be an integer bit pattern rather than a numeric
 * value -- confirm how callers are expected to consume it.
 * NOTE(review): the asm is not volatile, so the compiler is free to drop it
 * when the caller ignores the return value.
 */
static inline double __clip( register double B )
{
/* Receives the raw contents of the floating-point register written by fctiw. */
register double result;
/* %0 = output FP register (result), %1 = input FP register (B). */
asm( "fctiw %0, %1" : "=f" (result) : "f" (B) );
return result;
}
/*
 * F32L24_48_8 -- convert frames of eight float samples into byte-swapped
 * integer output words, applying a per-channel gain on the way.
 *
 *   v      eight gain factors, v[0]..v[7]; read once on entry
 *   ii     input samples; eight floats are consumed per frame
 *   oo     output words; eight words are written per frame, then the pointer
 *          advances by NUM_CHANNELS_OUT words (defined outside this block --
 *          presumably >= 8; confirm against the caller)
 *   count  number of frames to convert; zero or negative converts nothing
 *
 * Every sample goes through six stages:
 *   _load  read the float sample
 *   _volc  multiply by the channel gain
 *   _clip  limit the sample range
 *   _d2l1  first half of the double -> integer conversion
 *   _d2l2  second half of the conversion
 *   _stor  write the output word
 *
 * The loop body is a hand-scheduled software pipeline: the stages of the
 * eight channels are interleaved so that each stage of a channel is issued a
 * row or two after the previous one.  The order of the statements in that
 * table is the whole point of the routine -- do not "tidy" it.
 */
void F32L24_48_8(double *v, float *ii, UInt32 *oo, long count)
{
	/* Gains are copied to locals so the loop never has to re-read v[]. */
	const double v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3],
	             v4 = v[4], v5 = v[5], v6 = v[6], v7 = v[7];
	/* 2^31: maps the nominal [-1.0, 1.0] sample range onto signed 32 bits. */
	const double scale = 2147483648.0;

	/*
	 * Stage macros.  `i` is always a literal channel digit (0..7); it is
	 * token-pasted onto s/o/v to select that channel's locals.
	 */
#if FASTPLAY
	/* PowerPC variant using inline assembly. */
#define _load(i) s##i = ii[i]
#define _volc(i) s##i *= v##i
	/*
	 * NOTE(review): the value returned by __clip() is discarded here, so as
	 * written this stage does not modify s##i -- confirm the intent.
	 */
#define _clip(i) __clip(s##i)
#define _d2l1(i) s##i *= scale
#define _d2l2(i) o##i = (SInt32)s##i
	/* stwbrx stores o##i byte-reversed at byte offset i*4 from oo. */
#define _stor(i) __asm__( "stwbrx %0, %1, %2" : : "r" (o##i), "b%" (i << 2), "r" (oo) : "memory" )
#else
	/*
	 * Portable variant, no assembly.
	 * NOTE(review): the original comment calls this "the equivalent" of the
	 * variant above, but this one drops the low byte of the 32-bit sample
	 * while stwbrx writes all four bytes -- confirm which layout the
	 * hardware expects.
	 */
	/*
	 * Largest sample whose scaled value still fits in SInt32
	 * (clip_hi * scale == 2147483647.0 exactly).  Clipping to 1.0 would make
	 * the conversion below evaluate (SInt32)2147483648.0, which is undefined
	 * behaviour and wraps to full-scale negative on common hardware.  Every
	 * sample in (clip_hi, 1.0) already truncated to 2147483647, so in-range
	 * results are unchanged.
	 */
	const double clip_hi = 2147483647.0 / 2147483648.0;
#define _load(i) s##i = ii[i]
#define _volc(i) s##i *= v##i
#define _clip(i) do { if (s##i > clip_hi) s##i = clip_hi; else if (s##i < -1.0) s##i = -1.0; } while (0)
#define _d2l1(i) o##i = (SInt32)(s##i * scale)
	/*
	 * Keep bits 8..31 of the sample and reverse their byte order into the
	 * low 24 bits of the output word; the top byte of the result is zero.
	 */
#define _d2l2(i) o##i = (((o##i >> 8) & 0xff) << 16) | (((o##i >> 16) & 0xff) << 8) | (((o##i >> 24) & 0xff))
#define _stor(i) oo[i] = o##i
#endif

	double s0, s1, s2, s3, s4, s5, s6, s7;	/* per-channel sample in flight */
	UInt32 o0, o1, o2, o3, o4, o5, o6, o7;	/* per-channel output word */

	/* `> 0` so that a negative count is a no-op instead of a runaway loop. */
	while (count-- > 0) {
		/* staged pipeline, 6 stages x 8 channels (reads best with tab width 4) */
		_load(0);
		_load(1);
		_load(2); _volc(0);
		_load(3); _volc(1); _clip(0);
		_load(4); _volc(2); _clip(1); _d2l1(0);
		_load(5); _volc(3); _clip(2); _d2l1(1); _d2l2(0);
		_load(6); _volc(4); _clip(3); _d2l1(2); _d2l2(1);
		_load(7); _volc(5); _clip(4); _d2l1(3); _d2l2(2); _stor(0);
		          _volc(6); _clip(5); _d2l1(4); _d2l2(3); _stor(1);
		          _volc(7); _clip(6); _d2l1(5); _d2l2(4); _stor(2);
		                    _clip(7); _d2l1(6); _d2l2(5); _stor(3);
		                              _d2l1(7); _d2l2(6); _stor(4);
		                                        _d2l2(7); _stor(5);
		                                                  _stor(6);
		                                                  _stor(7);
		ii += 8;
		oo += NUM_CHANNELS_OUT;
	}
}