Coda File System

Re: codasrv crash on netbsd/sparc64 3.0

From: Sean Caron <caron.sean_at_gmail.com>
Date: Tue, 25 Apr 2006 14:50:10 -0400
Hi Greg,

Thanks for the patch. It fixed the signal 10 error, but replaced it with a
signal 11 error! Here's the scoop:

Contents of /vice/srv/SrvLog:

Partition /vicepa: inodes in use: 0, total: 2097152.
14:29:49 Partition /vicepa: 63091568K available (minfree=4%), 51727984K free.
14:29:49 The server (pid 8910) can be controlled using volutil commands
14:29:49 "volutil -help" will give you a list of these commands
14:29:49 If desperate,
                "kill -SIGWINCH 8910" will increase debugging level
14:29:49        "kill -SIGUSR2 8910" will set debugging level to zero
14:29:49        "kill -9 8910" will kill a runaway server
14:29:49 ****** FILE SERVER INTERRUPTED BY SIGNAL 11 ******
14:29:49 ****** Aborting outstanding transactions, stand by...
14:29:49 Uncommitted transactions: 0
14:29:49 Uncommitted transactions: 0
14:29:49 Committing suicide now ........

Contents of /vice/srv/SrvErr

Assertion failed: 0, file "srv.cc", line 302
EXITING! Bye!

Running it through gdb with flags -d 1 and backtracing shows:

blossom: {240} gdb /usr/local/sbin/codasrv

(gdb) run -d 1
Starting program: /usr/local/sbin/codasrv -d 1
Setting debuglevel to 1

Program received signal SIGSEGV, Segmentation fault.
0x40403364 in bcopy () from /usr/local/lib/libc.so.12
(gdb) bt
#0  0x40403364 in bcopy () from /usr/local/lib/libc.so.12
#1  0x00088bc0 in readints (f=0xffffffff, a=0xffffc568, b=0xffffc564,
pos=2659)
    at rwcdb_pack.h:73
#2  0x00087d4c in rwcdb_find (c=0x129000,
    k=0xe86e0 "NAMESystem:Administrators", klen=25) at rwcdb.c:272
#3  0x0008731c in PDB_db_read (h=0x129000, id=0,
    name=0xffffffff <Error reading address 0xffffffff: Invalid argument>,
    data=0xffffc654, size=0xffffc650) at pdbdb.c:288
#4  0x00086230 in PDB_readProfile_byname (h=0x129000,
    name=0x94710 "System:Administrators", r=0xffffc6c0) at pdbprofile.c:107
#5  0x000858e0 in PDB_lookupByName (name=0x94710 "System:Administrators",
    id=0xcf860) at pdb.c:385
#6  0x000848e4 in AL_NameToId (
    Name=0xffffffff <Error reading address 0xffffffff: Invalid argument>,
    Id=0xcf860) at alprocs.c:429
#7  0x0001401c in main (argc=-1, argv=0xcf860) at srv.cc:483
#8  0x00013840 in ___start ()
(gdb) up
#1  0x00088bc0 in readints (f=0xffffffff, a=0xffffc568, b=0xffffc564,
pos=2659)
    at rwcdb_pack.h:73
73              bcopy(&t, buf, sizeof(struct rwcdb_tuple));
Current language:  auto; currently c
(gdb) up
#2  0x00087d4c in rwcdb_find (c=0x129000,
    k=0xe86e0 "NAMESystem:Administrators", klen=25) at rwcdb.c:272
272             if (readints(&c->rf, &hash2, &pos, cur_pos))
(gdb) up
#3  0x0008731c in PDB_db_read (h=0x129000, id=0,
    name=0xffffffff <Error reading address 0xffffffff: Invalid argument>,
    data=0xffffc654, size=0xffffc650) at pdbdb.c:288
288                     rc = rwcdb_find(&h->main, namekey, strlen(namekey));
(gdb) up
#4  0x00086230 in PDB_readProfile_byname (h=0x129000,
    name=0x94710 "System:Administrators", r=0xffffc6c0) at pdbprofile.c:107
107             PDB_db_read(h, 0, name, &data, &size);
(gdb) up
#5  0x000858e0 in PDB_lookupByName (name=0x94710 "System:Administrators",
    id=0xcf860) at pdb.c:385
385             PDB_readProfile_byname(h, name, &r);
(gdb) up
#6  0x000848e4 in AL_NameToId (
    Name=0xffffffff <Error reading address 0xffffffff: Invalid argument>,
    Id=0xcf860) at alprocs.c:429
429             PDB_lookupByName(Name, (int32_t *) Id);
(gdb) up
#7  0x0001401c in main (argc=-1, argv=0xcf860) at srv.cc:483
483         if (AL_NameToId(PRS_ADMINGROUP, &SystemId) ||
Current language:  auto; currently c++
(gdb) up
#8  0x00013840 in ___start ()
(gdb) i frame
Stack level 8, frame at 0xffffc990:
 pc = 0x13840 in ___start; saved pc 0x13794
 caller of frame at 0xffffc928
 Arglist at 0xffffc990, args:
 Locals at 0xffffc990, Previous frame's sp in sp
(gdb) list
478         DIR_Init(DIR_DATA_IN_VM);
479
480         stat(CODADB, &buff);
481         pdbtime = (int)buff.st_mtime;
482         CODA_ASSERT(AL_Initialize(AL_VERSION) == 0);
483         if (AL_NameToId(PRS_ADMINGROUP, &SystemId) ||
484             AL_NameToId(PRS_ANYUSERGROUP, &AnyUserId)) {
485             SLog(0, "Failed to find '" PRS_ADMINGROUP "' or '"
PRS_ANYUSERGROUP
486                     "' in the pdb database.");
487             CODA_ASSERT(0 && "check pdb database");
(gdb)


Any thoughts?

Thanks, Sean
scaron_at_umich.edu


On 4/25/06, Greg Troxel <gdt_at_ir.bbn.com> wrote:
>
> The code in rwcdb_pack.h doesn't check for alignment.
> Try this:
>
> --- rwcdb_pack.h.~1.4.~ 2005-06-20 08:45:54.000000000 -0400
> +++ rwcdb_pack.h        2006-04-25 13:24:23.000000000 -0400
> @@ -59,6 +59,7 @@
> static __inline__ void packints(char *buf, const u_int32_t a, const
> u_int32_t b)
> {
>      struct rwcdb_tuple *p = (struct rwcdb_tuple *)buf;
> +    /* XXX alignment */
>      p->a = SWAP_OUT(a);
>      p->b = SWAP_OUT(b);
> }
> @@ -66,8 +67,16 @@
> static __inline__ void unpackints(char *buf, u_int32_t *a, u_int32_t *b)
> {
>      struct rwcdb_tuple *p = (struct rwcdb_tuple *)buf;
> -    *a = SWAP_IN(p->a);
> -    *b = SWAP_IN(p->b);
> +    /* XXX cast is not right */
> +    if ((long) p & 0x3) {
> +      struct rwcdb_tuple t;
> +      bcopy(&t, buf, sizeof(struct rwcdb_tuple));
> +      *a = SWAP_IN(t.a);
> +      *b = SWAP_IN(t.b);
> +    } else {
> +      *a = SWAP_IN(p->a);
> +      *b = SWAP_IN(p->b);
> +    }
> }
>
> #endif /* _RWCDB_PACK_H_ */
>
> --
>         Greg Troxel <gdt_at_ir.bbn.com>
>
Received on 2006-04-25 14:52:25