Commit 878da831 authored Apr 25, 2002 by David Mosberger
ia64: Add optimized ip_fast_csum() by Ken Chen and merge his cleanups
to do_csum.S.
parent d2c4281c
Showing 4 changed files with 120 additions and 36 deletions (+120 -36)
arch/ia64/lib/Makefile		+1 -1
arch/ia64/lib/checksum.c	+13 -25
arch/ia64/lib/do_csum.S		+17 -10
arch/ia64/lib/ip_fast_csum.S	+89 -0
arch/ia64/lib/Makefile

@@ -13,7 +13,7 @@ obj-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
 	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
 	checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
 	copy_user.o clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
-	flush.o io.o do_csum.o						\
+	flush.o io.o ip_fast_csum.o do_csum.o				\
 	memcpy.o memset.o strlen.o swiotlb.o

 obj-$(CONFIG_ITANIUM)	+= copy_page.o
arch/ia64/lib/checksum.c
@@ -15,7 +15,7 @@
 #include <asm/byteorder.h>

 static inline unsigned short
-from64to16(unsigned long x)
+from64to16 (unsigned long x)
 {
 	/* add up 32-bit words for 33 bits */
 	x = (x & 0xffffffff) + (x >> 32);
@@ -32,22 +32,17 @@ from64to16(unsigned long x)
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented.
  */
-unsigned short int csum_tcpudp_magic(unsigned long saddr,
-				     unsigned long daddr,
-				     unsigned short len,
-				     unsigned short proto,
-				     unsigned int sum)
+unsigned short int
+csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
+		   unsigned short proto, unsigned int sum)
 {
-	return ~from64to16(saddr + daddr + sum +
-			   ((unsigned long) ntohs(len) << 16) +
-			   ((unsigned long) proto << 8));
+	return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
+			   ((unsigned long) proto << 8));
 }

-unsigned int csum_tcpudp_nofold(unsigned long saddr,
-				unsigned long daddr,
-				unsigned short len,
-				unsigned short proto,
-				unsigned int sum)
+unsigned int
+csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
+		    unsigned short proto, unsigned int sum)
 {
 	unsigned long result;
@@ -65,15 +60,6 @@ unsigned int csum_tcpudp_nofold(unsigned long saddr,
 extern unsigned long do_csum(const unsigned char *, long);

-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
-{
-	return ~do_csum(iph,ihl*4);
-}
-
 /*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
@@ -86,7 +72,8 @@ unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
  *
  * it's best to have buff aligned on a 32-bit boundary
  */
-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+unsigned int
+csum_partial (const unsigned char * buff, int len, unsigned int sum)
 {
 	unsigned long result = do_csum(buff, len);
@@ -102,7 +89,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
  * this routine is used for miscellaneous IP-like checksums, mainly
  * in icmp.c
  */
-unsigned short ip_compute_csum(unsigned char * buff, int len)
+unsigned short
+ip_compute_csum (unsigned char * buff, int len)
 {
 	return ~do_csum(buff,len);
 }
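The checksum.c hunks above show only the first folding step of from64to16() (x = (x & 0xffffffff) + (x >> 32);), which csum_tcpudp_magic() applies to the pseudo-header sum before complementing it. As a reading aid, here is a hedged, standalone C sketch of how such a 64-bit accumulator is typically folded down to a 16-bit ones'-complement checksum; the helper name fold64to16 and the sample addresses are illustrative and not part of the commit.

#include <stdio.h>

/* Illustrative 64->16 fold with end-around carries (the usual Internet
 * checksum pattern, cf. RFC 1071); only the first step of the kernel's
 * from64to16() is visible in the hunk above. */
static unsigned short fold64to16(unsigned long long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* add up 32-bit words for 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* fold 33 bits down to ~18 */
	x = (x & 0xffff) + (x >> 16);		/* ... then to 17 */
	x = (x & 0xffff) + (x >> 16);		/* absorb the final carry */
	return (unsigned short)x;
}

int main(void)
{
	/* pseudo-header style input: saddr + daddr + payload sum (made-up values) */
	unsigned long long sum = 0xc0a80001ULL + 0xc0a80002ULL + 0x1234ULL;

	printf("folded 0x%04x, complemented 0x%04x\n",
	       fold64to16(sum), (unsigned short)~fold64to16(sum));
	return 0;
}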
arch/ia64/lib/do_csum.S
@@ -11,6 +11,9 @@
  *
  *	Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
  *	Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
+ *		Data locality study on the checksum buffer.
+ *		More optimization cleanup - remove excessive stop bits.
  * 02/04/08	David Mosberger <davidm@hpl.hp.com>
  *		More cleanup and tuning.
  * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
@@ -80,6 +83,12 @@
 //	type of packet or alignment we get. Like the ip_fast_csum() routine
 //	where we know we have at least 20 bytes worth of data to checksum.
 // - Do a better job of handling small packets.
+// - Note on prefetching: it was found that under various load, i.e. ftp read/write,
+//   nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
+//   on the data that buffer points to (partly because the checksum is often preceded by
+//   a copy_from_user()). This finding indiate that lfetch will not be beneficial since
+//   the data is already in the cache.
+//

 #define saved_pfs	r11
 #define hmask		r16
@@ -117,7 +126,7 @@
 GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,1,16
+	alloc saved_pfs=ar.pfs,2,16,0,16
 	.rotr word1[4], word2[4], result1[LOAD_LATENCY+2], result2[LOAD_LATENCY+2]
 	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
 	mov ret0=r0		// in case we have zero length
@@ -197,22 +206,21 @@ GLOBAL_ENTRY(do_csum)
 	// Calculate the checksum loading two 8-byte words per loop.
 	//
 .do_csum16:
 	mov saved_lc=ar.lc
 	shr.u count=count,1		// we do 16 bytes per loop
 	;;
-	cmp.eq p9,p10=r0,count		// if (count == 0)
 	adds count=-1,count
 	brp.loop.imp 1f,2f
 	;;
+	cmp.eq p9,p10=r0,count		// if (count == 0)
 	mov ar.ec=PIPE_DEPTH
+	mov ar.lc=count			// set lc
+	// result1[0] must be initialized in advance.
+	mov result2[0]=r0
+	mov pr.rot=1<<16
 	mov carry1=r0
 	mov carry2=r0
 	add first2=8,first1
 	;;
-	mov ar.lc=count			// set lc
-	mov pr.rot=1<<16
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
 (p9)	br.cond.sptk .do_csum_exit
 	;;
 	.align 32
@@ -223,7 +231,7 @@ GLOBAL_ENTRY(do_csum)
 (pC2[1])adds carry2=1,carry2
 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-[2:]
+2:
 (p[0])	ld8 word1[0]=[first1],16
 (p[0])	ld8 word2[0]=[first2],16
 	br.ctop.sptk 1b
@@ -246,7 +254,6 @@ GLOBAL_ENTRY(do_csum)
 	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
 	;;
 (p6)	adds result1[0]=1,result1[0]
 	;;
 .do_csum_exit:
 	//
 	// now fold 64 into 16 bits taking care of carry
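To make the do_csum.S changes easier to follow, here is a rough C model of what the pipelined loop computes: two 64-bit accumulators walk the buffer 16 bytes per iteration, end-around carries are tracked (the assembly counts them in carry1/carry2 with cmp.ltu and adds them back after the loop), and the combined result is folded to 16 bits at .do_csum_exit. This is only a sketch under simplifying assumptions (8-byte-aligned buffer, length a multiple of 16, no head/tail or odd-address handling); the helper names are illustrative, not kernel interfaces.

#include <stdint.h>
#include <string.h>

/* 64-bit ones'-complement add: the carry-out is folded straight back in,
 * where the assembly instead counts carries and adds them later. */
static uint64_t add_carry(uint64_t a, uint64_t b)
{
	uint64_t s = a + b;
	return s + (s < a);
}

static unsigned short do_csum_model(const unsigned char *buf, long len)
{
	uint64_t r1 = 0, r2 = 0, w1, w2;
	long i;

	/* main loop: two ld8 per iteration (word1/word2), two result chains */
	for (i = 0; i + 16 <= len; i += 16) {
		memcpy(&w1, buf + i, 8);
		memcpy(&w2, buf + i + 8, 8);
		r1 = add_carry(r1, w1);
		r2 = add_carry(r2, w2);
	}
	r1 = add_carry(r1, r2);		/* combine the two result chains */

	/* .do_csum_exit: now fold 64 into 16 bits taking care of carry */
	r1 = (r1 & 0xffffffff) + (r1 >> 32);
	r1 = (r1 & 0xffff) + (r1 >> 16);
	r1 = (r1 & 0xffff) + (r1 >> 16);
	r1 = (r1 & 0xffff) + (r1 >> 16);
	return (unsigned short)r1;
}

As with do_csum itself, the model returns the non-complemented 16-bit sum; callers such as ip_compute_csum() complement it.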
arch/ia64/lib/ip_fast_csum.S (new file, 0 → 100644)
/*
 * Optmized version of the ip_fast_csum() function
 * Used for calculating IP header checksum
 *
 * Return: 16bit checksum, complemented
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */

#include <asm/asmmacro.h>

/*
 * Since we know that most likely this function is called with buf aligned
 * on 4-byte boundary and 20 bytes in length, we can execution rather quickly
 * versus calling generic version of do_csum, which has lots of overhead in
 * handling various alignments and sizes.  However, due to lack of constrains
 * put on the function input argument, cases with alignment not on 4-byte or
 * size not equal to 20 bytes will be handled by the generic do_csum function.
 */

#define in0	r32
#define in1	r33
#define ret0	r8

GLOBAL_ENTRY(ip_fast_csum)
	.body
	cmp.ne		p6,p7=5,in1	// size other than 20 byte?
	and		r14=3,in0	// is it aligned on 4-byte?
	add		r15=4,in0	// second source pointer
	;;
	cmp.ne.or.andcm	p6,p7=r14,r0
	;;
(p7)	ld4		r20=[in0],8
(p7)	ld4		r21=[r15],8
(p6)	br.spnt		.generic
	;;
	ld4		r22=[in0],8
	ld4		r23=[r15],8
	;;
	ld4		r24=[in0]
	add		r20=r20,r21
	add		r22=r22,r23
	;;
	add		r20=r20,r22
	;;
	add		r20=r20,r24
	;;
	shr.u		ret0=r20,16	// now need to add the carry
	zxt2		r20=r20
	;;
	add		r20=ret0,r20
	;;
	shr.u		ret0=r20,16	// add carry again
	zxt2		r20=r20
	;;
	add		r20=ret0,r20
	;;
	shr.u		ret0=r20,16
	zxt2		r20=r20
	;;
	add		r20=ret0,r20
	;;
	andcm		ret0=-1,r20
	.restore sp			// reset frame state
	br.ret.sptk.many b0
	;;

.generic:
	.prologue
	.save ar.pfs, r35
	alloc		r35=ar.pfs,2,2,2,0
	.save rp, r34
	mov		r34=b0
	.body
	dep.z		out1=in1,2,30
	mov		out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	andcm		ret0=-1,ret0
	mov		ar.pfs=r35
	mov		b0=r34
	br.ret.sptk.many b0
END(ip_fast_csum)
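For reference, here is a small C model of the fast path above, under the same conditions the assembly tests for (ihl == 5, i.e. a 20-byte header, buffer aligned on 4 bytes); anything else takes the .generic branch, which calls do_csum() and complements the result. The model and its names are illustrative only, not the kernel's interface.

#include <stdint.h>
#include <string.h>

static unsigned short ip_fast_csum_model(const unsigned char *iph, unsigned int ihl)
{
	uint32_t w[5];
	uint64_t sum;

	/* the assembly branches to .generic (do_csum) in these cases;
	 * this model only covers the 20-byte, 4-byte-aligned fast path */
	if (ihl != 5 || ((uintptr_t)iph & 3))
		return 0;

	memcpy(w, iph, sizeof(w));		/* the five ld4 loads */
	sum = (uint64_t)w[0] + w[1] + w[2] + w[3] + w[4];

	/* three shr.u/zxt2/add rounds fold the ~35-bit sum down to 16 bits */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned short)~sum;		/* andcm ret0=-1,r20 */
}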