-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunix-v6-trouble-1.html
1155 lines (1023 loc) · 69.1 KB
/
unix-v6-trouble-1.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>fritzm.github.io - PDP-11/45: V6 Unix Troubleshooting</title>
<meta name="description" content="">
<meta name="author" content="Fritz Mueller">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
<script src="https://fritzm.github.io/theme/html5.js"></script>
<![endif]-->
<!-- Le styles -->
<link href="https://fritzm.github.io/theme/bootstrap.min.css" rel="stylesheet">
<link href="https://fritzm.github.io/theme/bootstrap.min.responsive.css" rel="stylesheet">
<link href="https://fritzm.github.io/theme/local.css" rel="stylesheet">
<link href="https://fritzm.github.io/theme/pygments.css" rel="stylesheet">
<!-- Photoswipe -->
<link rel="stylesheet" href="https://fritzm.github.io/theme/photoswipe.css">
<link rel="stylesheet" href="https://fritzm.github.io/theme/default-skin/default-skin.css">
<script src="https://fritzm.github.io/theme/photoswipe.min.js"></script>
<script src="https://fritzm.github.io/theme/photoswipe-ui-default.min.js"></script>
<script src="https://fritzm.github.io/galleries.js"></script>
<script type="text/javascript">
// Open the PhotoSwipe lightbox for the gallery named `gname`, starting at
// slide number `index`. Exposed as a global so page markup can call it.
var pswipe = function(gname, index) {
  // First element carrying the .pswp class is used as the viewer root.
  var root = document.querySelectorAll('.pswp')[0],
      // `galleries` is presumably defined by galleries.js - verify against that file.
      slides = galleries[gname],
      viewer = new PhotoSwipe(root, PhotoSwipeUI_Default, slides, { index: index });
  viewer.init();
};
</script>
<!-- Feed autodiscovery link, so browsers such as Firefox can offer to subscribe to this site -->
<link href="https://fritzm.github.io/feeds/all.rss.xml" rel="alternate" title="fritzm.github.io" type="application/rss+xml">
</head>
<body>
<div class="navbar">
<div class="navbar-inner">
<div class="container">
<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</a>
<a class="brand" href="https://fritzm.github.io">fritzm.github.io</a>
<div class="nav-collapse">
<ul class="nav">
</ul>
</div>
</div>
</div>
</div>
<div class="container">
<div class="content">
<div class="row">
<div class="span9">
<div class='article'>
<div class="content-title">
<h1>PDP-11/45: V6 Unix Troubleshooting</h1>
Sat 24 October 2020
by <a class="url fn" href="https://fritzm.github.io/author/fritz-mueller.html">Fritz Mueller</a>
</div>
<div><p><em>[A catch-up article, documenting discoveries of Jan/Feb 2019]</em></p>
<p>In early 2019, I made a V6 Unix pack from the Ken Wellsch tape image, as mentioned in <a href="https://fritzm.github.io/unix-and-ms11.html">this blog
entry</a>. It booted on my machine, but dumped core on the first <code>ls</code> in single-user
mode, or as soon as I did any heavy lifting in multi-user mode.</p>
<p>The following is the first part of a chronology of the troubleshooting campaign that took place over the next
month and a half, culminating in a smoking gun hardware fix and successful operation of V6 Unix on the
machine. This was largely a collaborative effort between Noel Chiappa and myself via direct email
correspondence, though help was received from others via the cctalk mailing list as well.</p>
<h3>January 8-9</h3>
<p>Initial experiments. Described the <code>ls</code> crashes to Noel. He theorizes that the reason <code>ls</code> works in one case and
crashes in another is that it lands in a different spot in memory in each case.</p>
<p>Luckily, a subsequent <code>od</code> on the core file does not crash, and a core file is successfully extracted:</p>
<div class="highlight"><pre><span></span>140004 000000 141710 141724
$DK
@rkunix
mem = 1035
RESTRICTED RIGHTS
Use, duplication or disclosure is subject to
restrictions stated in Contract with Western
Electric Company, Inc.
# LS
MEMORY FAULT -- CORE DUMPED
# OD CORE
0000000 141552 141562 000000 000000 000000 000000 000000 000000
0000020 000000
0000060 000000 000000 000000 000001 000000 000000 063260 140076
0000100 001700 000000 000104 066112 067543 062562 000000 000000
0000120 000000 000000 000000 060221 000567 067543 062562 000000
0000140 000000 000000 000000 000000 066112 000000 000020 000000
0000160 000000 000000 000000 000000 177701 000000 000020 000000
0000200 000000 000000 000000 000000 177701 041402 016006 000000
0000220 000000 000000 000000 000000 066016 041402 016006 000000
0000240 000000 000000 000000 000000 066016 075120 075120 075120
0000260 000000
0000300 000000 000000 000000 000000 000013 010400 001050 002366
0000320 000000 000104 000035 000024 000000 141732 141742 141664
0000340 141674 000000 000000 000000 000000 000000 000000 000000
0000360 000000
0000400 000000 000000 000000 000000 000000 000000 000012 000000
0000420 000000 000000 000000 141772 000000 000000 000000 000000
0000440 000000
0001500 000000 025334 003602 001236 025334 003602 002454 003602
0001520 063260 177716 000000 141542 016070 001176 000000 003602
0001540 063260 177716 000000 141562 016070 001176 066352 030300
0001560 063260 025334 003602 077572 000013 107564 141626 000512
0001600 000000 141604 141616 000300 074616 025334 003602 000217
0001620 000203 107404 020276 000512 000000 141634 141640 003602
0001640 000007 000135 107454 141662 014314 003602 066352 005674
0001660 000000 141712 013640 074616 000000 001000 000000 000000
0001700 001000 074616 063260 066352 000013 141726 023730 066352
0001720 063260 000000 000013 141742 023502 003602 000000 177760
0001740 000013 141756 022050 000013 000000 000000 000000 000034
0001760 000444 000031 177760 000000 030351 177770 010210 170010
0002000 000001 177777 177777 023436 023436 020264 000162 000262
0002020 000262 000202 000262 000256 000210 000262 000250 000262
0002040 000262 000216 000262 000262 000262 000262 000262 000224
0002060 000170 000234 000242 000003 100000 000144 040000 000142
0002100 020000 000143 000055 000001 000400 000162 000055 000001
0002120 000200 000167 000055 000002 004000 000163 000100 000170
0002140 000055 000001 000040 000162 000055 000001 000020 000167
0002160 000055 000002 002000 000163 000010 000170 000055 000001
0002200 000004 000162 000055 000001 000002 000167 000055 000001
0002220 000001 000170 000055 000001 010000 000164 000040 020066
0002240 020106 020116 020126 020142 020152 020162 020176 020206
0002260 020216 020226 000056 062457 061564 070057 071541 073563
0002300 000144 062457 061564 063457 067562 070165 005000 071445
0002320 005072 072000 072157 066141 022440 005144 022400 062065
0002340 000040 031045 020144 022400 033055 033056 000163 026445
0002360 062066 022400 062063 022454 062063 022400 071467 020000
0002400 026445 027067 071467 022440 032055 032056 020163 020000
0002420 026445 031061 030456 071462 000040 032045 020144 022400
0002440 005163 022400 030456 071464 000012 071445 072440 071156
0002460 060545 060544 066142 005145 022400 020163 067556 020164
0002500 067546 067165 005144 000000 003750 000144 004076 000157
0002520 004070 000170 004172 000146 004210 000145 004026 000143
0002540 004044 000163 003764 000154 004226 000162 000000 000000
0002560 177774 177760 177775 177770 104404 022376 000000 104405
0002600 000000 000000 104403 000000 001000 104405 000000 000000
0002620 104421 000000 023436 104423 000000 000000 104422 000000
0002640 000000 000037 000034 000037 000036 000037 000036 000037
0002660 000037 000036 000037 000036 000037 043120 020712 020716
0002700 000001 000005 000515 000072 000457 051505 000124 042105
0002720 000124 060504 020171 067515 020156 030060 030040 035060
0002740 030060 030072 020060 034461 030060 000012 072523 046556
0002760 067157 072524 053545 062145 064124 043165 064562 060523
0003000 000164 060512 043156 061145 060515 040562 071160 060515
0003020 045171 067165 072512 040554 063565 062523 047560 072143
0003040 067516 042166 061545 000000 000000 000000 000000 000000
0003060 000000
0010060 000000 000020 000001 177770 177774 177777 071554 000000
0010100
#
</pre></div>
<p>Noel prepares to analyze the core file (block quotes here and further below taken from email correspondence):</p>
<blockquote>
<p>I just checked, and the binary for the 'ls' command is what's called 'pure code'; i.e. the instructions are
in a separate (potentially shared) block of memory from the process' data (un-shared).</p>
</blockquote>
<hr>
<blockquote>
<p>On another front, that error message ("Memory error") is produced when a process gets a 'memory management
trap' (trap to 0250). This could be caused by any number of things (it's a pity we don't know the contents
of SR0 when the trap happened, that would tell us exactly what the cause was).</p>
</blockquote>
<hr>
<blockquote>
<p>[Memory management registers in the core dump] are 'prototypes', later modified for actual use by adding in
the actual address in main memory. Still trying to understand how that works - the code (in sureg() in
main.c) is kind of obscure.</p>
</blockquote>
<h3>January 10-24</h3>
<p>Further communication with Noel and the cctalk list raises some suspicion about the memory in my machine.
Though I had done spot checks and repairs on this in the past, which had been sufficient to pass most MAINDEC
diagnostics and to boot and run RT11, in fact the memory had not yet been exhaustively tested.</p>
<p>Over the course of some days, memory test codes are developed and run, and several additional failed DRAMs in
the MS11 memory system are isolated and repaired. These efforts have previously been reported in detail in
<a href="https://fritzm.github.io/unix-and-ms11.html">this blog entry</a>.</p>
<p>After these repairs, the MAINDEC MS11 memory diagnostics and KT11-C MMU diagnostics, both of which are beastly
and exhaustive, are found to pass robustly with one caveat: memory parity tests. A deep-dive into the design
and implementation of memory parity on the PDP-11/45 follows. At the end it is concluded that the machine, a
very early serial no. in its line, is in fact functioning per-design. These efforts are documented in <a href="https://fritzm.github.io/parity-handling.html">this
blog entry</a>.</p>
<p>Even though the memory system looks solid after this, the V6 Unix crash behavior remains exactly the same...</p>
<h3>January 27-29</h3>
<p>With the KT11 and memory now verified, Noel takes up the core dump again:</p>
<blockquote>
<p>The problem is that Unix does not save enough info in the core dump for me to thoroughly diagnose the MM
fault; e.g. 'ls' is a 'pure text' program/command, and the code's not included in the core dump (in normal
operation, there's no need/use for it), so I don't have the code that was running at the time, just the data
and swappable per-process kernel data - which is not all the per-process data, e.g. it doesn't include the
location of the process's code and data segments in main memory.</p>
<p>Also, I'll look at the V6 code that sets up the KT11 registers to make sure I understand what it's doing.
(The dump contains the 'prototype' for those contents, but the values are modified, by adding the actual
memory location, before being stored in the KT11.)</p>
</blockquote>
<hr>
<blockquote>
<p>I did find out that the PC at the time of the segmentation fault was 010210, which I thought looked awfully
big (so I was wondering if somehow it went crazy), but in fact the text size is 010400, so it's just inside
the pure text.</p>
</blockquote>
<p>We agree to use
<a href="https://en.wikipedia.org/wiki/Lions%27_Commentary_on_UNIX_6th_Edition,_with_Source_Code"><em>Lions</em></a> as a common
reference point for detailed discussion of the loading and running of "ls" and what may be seen in the core
dump.</p>
<h3>January 30</h3>
<p>Noel:</p>
<blockquote>
<p>So, a bit more from my examination of the swappable per-process kernel data (the 'user' structure - not sure
how much of a Unix internals person you are).</p>
<p>It gives the following for the text, data and stack sizes:</p>
<div class="highlight"><pre><span></span>tsize 000104
dsize 000035
ssize 000024
</pre></div>
<p>which seems reasonable/correct, because looking at the header for 'ls' we see:</p>
<div class="highlight"><pre><span></span>000410 010400 001050 002366 000000 000000 000000 000001
</pre></div>
<p>'0410' says it's pure text, non-split; the 010400 is the text size, which matches (those sizes above are in
'clicks', i.e. the 0100 byte quantum used in the PDP-11 memory management).</p>
<p>The data size also appears to be correct:</p>
<div class="highlight"><pre><span></span>001050 (initialized)
002366 (BSS)
------
003436
</pre></div>
<p>which again matches (round up and divide by 0100).</p>
<p>I have yet to dig around through the system sources and see what the initial stack allocation is, to see if
that's reasonable (of course, it may have been extended during execution).</p>
<p>And here are the 'prototype' segmentation register contents:</p>
<div class="highlight"><pre><span></span>UISA 000000 000020 000000 000000 000000 000000 000000 177701
UDSA 000000 000020 000000 000000 000000 000000 000000 177701
UISD 041402 016006 000000 000000 000000 000000 000000 066016
UDSD 041402 016006 000000 000000 000000 000000 000000 066016
</pre></div>
<p>Since it's not split, the D-space ones are clones of the I-space (which is what the code does - I don't
think it turns user D off and on, depending on what the process has: I'd have made context switching faster
by not having to set up the D-space registers for non-split processes, but I guess the extra overhead is
pretty minimal).</p>
<p>I have yet to check all the contents to make sure they look good, but the U?SA registers look OK; the '020'
is for the data, and that's kept contiguous with the 'user' area, so the '020' is to offset past that.</p>
<p>The PC at fault time of 010210 seems to point to the following code (assuming what was in main memory was
actually the same as the binary on the disk):</p>
<div class="highlight"><pre><span></span> mov r4,r0
jmp 10226
210: mov r5,r0
mov sp,r5
</pre></div>
<p>We don't have SSR2, which points to the failing instruction, and I forget whether the saved PC on an MMU
fault points to the failing instruction, or the next one; I'm going to assume the latter.</p>
<p>But either way, this is very puzzling, because I don't see an instruction there that could have gotten an
MMU fault! The jump is to a location within the text segment (albeit at the end), and everything else is
just register-register moves!</p>
<p>And how could the fault depend on the location in main memory?!?!</p>
<p>If you want to poke around in the core dump yourself, to verify that I haven't made a mistake, see this
page:</p>
<p><a href="http://gunkies.org/wiki/Unix_V6_dump_analysis">http://gunkies.org/wiki/Unix_V6_dump_analysis</a></p>
<p>which gives useful offsets. (The ones in the user table I verified by writing a short program which did
things like 'printf("%o", &amp;0-&gt;u_uisa)', and the data at those locations looks like what should be there, so
I'm pretty sure that table is good. For the other one, core(5) (in the V6 man pages) gives the register
offsets (albeit in a different form), so you can check that I worked them out correctly.)</p>
<p>Two things you could try to get rid of potential pattern sensitivities: before doing the 'ls', say 'sleep
360 &' first; that running in the background <em>should</em> cause the 'ls' to be loaded and run from a different
address in main memory. The other thing you could try is 'cp /bin/ls xls' and then 'xls', to load the
command from a different disk location. (Both of these assume that you don't get another fault, of course!)</p>
</blockquote>
<hr>
<blockquote>
<p>[Initial stack size] is 20. clicks, which is what it still is (024 clicks) in the process core dump, so
the stack has <em>not</em> been extended. So any MM fault you see after starting 'ls' will <em>probably</em> be the one
that's causing the process to blow out.</p>
</blockquote>
<hr>
<blockquote>
<p>I tried to re-create that exact version of the 'ls' binary, because the one in the distro is stripped, and I
wanted one with symbols to look at. I failed, because a library routine (for dates) has changed on my
machine, see here:</p>
<p><a href="http://www.chiappa.net/~jnc/tech/V6Unix.html#Issues">http://www.chiappa.net/~jnc/tech/V6Unix.html#Issues</a></p>
<p>However, I did verify that the binary for ls.o is identical to what I can produce (using the -O flag). It's
just that library routine which is different. I don't think it's worth backing out my library; I did manage
to hand-produce a stub of the symbol table for where the error is happening in the old 'ls' binary:</p>
<div class="highlight"><pre><span></span>010210T csv
010226T cret
010244T cerror
010262T _ldiv
010304T _lrem
010324T _dpadd
</pre></div>
<p>The fault does indeed seem to be happening at either the last instruction in the previous routine (ct_year,
in ctime.c), or the first of csv.</p>
<p>(I should explain that PDP-11 C uses two small chunks of code, CSV and CRET, to construct and take down
stack frames on procedure entry and exit. So on exit from <em>any</em> C procedure, the last instruction is always
a PC-relative jump to CRET.)</p>
<p>It looks like that's what's blowing up - but it apparently works with the command at a different location in
main memory! So it pretty much has to be a pattern sensitivity.</p>
<p>However, I think the KT11 does the bounds checking <em>before</em> it does the relocation - the bounds checking is
done on virtual, un-relocated addresses. So <em>that</em> part of it <em>should</em> be the same for both locations! So
here's my analysis:</p>
<p>Is it actually an indexed jump that's blowing up? I've been looking at the command binary, but that might
not be what's in main memory. Or the CPU might be looking somewhere else (because of a KT error). (If we
don't find the problem soon, we might want to put in that breakpoint so we can look in main memory and see
what inst is actually at the location where SSR2 says the failing inst was; that can rule out a whole bunch
of potential causes in one go - e.g. RK11 errors.)</p>
<p>If it is actually that jump that's failing - how? The PC hasn't been updated yet, so it can't be the fetch
of the next instruction that's failing. Is the fetch of the index word producing the MM fault?</p>
</blockquote>
<p>Fritz:</p>
<blockquote>
<p>It occurs to me that we don't even <em>really</em> know if the fault occurs from the same address every time, since
we have a core sample size of 1; I should duplicate the fail and extract another core file to compare.</p>
</blockquote>
<hr>
<blockquote>
<p>Another thing I thought I might try tonight: deposit a trap catcher in the memory mgmt trap location from
the front panel, just before issuing the 'ls' command. I can then check the PSW, PC, SP, and KT11 regs
right at the time of fault.</p>
</blockquote>
<p>Experiments begin from the front panel, and continue on into the early hours, producing:</p>
<p>Core #2:</p>
<div class="highlight"><pre><span></span>140004 000000 141710 141724
$DK
@rkunix
mem = 1035
RESTRICTED RIGHTS
Use, duplication or disclosure is subject to
restrictions stated in Contract with Western
Electric Company, Inc.
# RM CORE
# LS
MEMORY FAULT -- CORE DUMPED
# OD CORE
0000000 141552 141562 000000 000000 000000 000000 000000 000000
0000020 000000
0000060 000000 000000 000000 000001 000000 000000 063260 140076
0000100 001700 000000 000104 066112 067543 062562 000000 000000
0000120 000000 000000 000000 060221 000571 067543 062562 000000
0000140 000000 000000 000000 000000 066112 000000 000020 000000
0000160 000000 000000 000000 000000 177701 000000 000020 000000
0000200 000000 000000 000000 000000 177701 041402 016006 000000
0000220 000000 000000 000000 000000 066016 041402 016006 000000
0000240 000000 000000 000000 000000 066016 075120 075120 075120
0000260 000000
0000300 000000 000000 000000 000000 000013 010400 001050 002366
0000320 000000 000104 000035 000024 000000 141732 141742 141664
0000340 141674 000000 000000 000000 000000 000000 000000 000000
0000360 000000
0000400 000000 000000 000000 000000 000000 000000 000011 000000
0000420 000000 000000 000000 141772 000000 000000 000000 000000
0000440 000000
0001500 000000 000000 000000 000000 000000 000000 000000 003602
0001520 063260 177716 000000 141542 016070 001176 000000 003602
0001540 063260 177716 000000 141562 016070 001176 066352 030300
0001560 063260 141576 000005 003602 066352 001612 074376 044516
0001600 003602 025334 003602 000000 000443 107144 141646 000512
0001620 000000 141624 141640 000300 020276 020356 030000 003602
0001640 000007 000135 107454 141662 014314 003602 066352 004404
0001660 000000 141712 013640 074616 000000 001000 000000 000000
0001700 001000 074616 063260 066352 000013 141726 023730 066352
0001720 063260 000000 000013 141742 023502 003602 000000 177760
0001740 000013 141756 022050 000013 000000 000000 000000 000034
0001760 000444 000031 177760 000000 030351 177770 010210 170010
0002000 000001 177777 177777 023436 023436 020264 000162 000262
0002020 000262 000202 000262 000256 000210 000262 000250 000262
0002040 000262 000216 000262 000262 000262 000262 000262 000224
0002060 000170 000234 000242 000003 100000 000144 040000 000142
0002100 020000 000143 000055 000001 000400 000162 000055 000001
0002120 000200 000167 000055 000002 004000 000163 000100 000170
0002140 000055 000001 000040 000162 000055 000001 000020 000167
0002160 000055 000002 002000 000163 000010 000170 000055 000001
0002200 000004 000162 000055 000001 000002 000167 000055 000001
0002220 000001 000170 000055 000001 010000 000164 000040 020066
0002240 020106 020116 020126 020142 020152 020162 020176 020206
0002260 020216 020226 000056 062457 061564 070057 071541 073563
0002300 000144 062457 061564 063457 067562 070165 005000 071445
0002320 005072 072000 072157 066141 022440 005144 022400 062065
0002340 000040 031045 020144 022400 033055 033056 000163 026445
0002360 062066 022400 062063 022454 062063 022400 071467 020000
0002400 026445 027067 071467 022440 032055 032056 020163 020000
0002420 026445 031061 030456 071462 000040 032045 020144 022400
0002440 005163 022400 030456 071464 000012 071445 072440 071156
0002460 060545 060544 066142 005145 022400 020163 067556 020164
0002500 067546 067165 005144 000000 003750 000144 004076 000157
0002520 004070 000170 004172 000146 004210 000145 004026 000143
0002540 004044 000163 003764 000154 004226 000162 000000 000000
0002560 177774 177760 177775 177770 104404 022376 000000 104405
0002600 000000 000000 104403 000000 001000 104405 000000 000000
0002620 104421 000000 023436 104423 000000 000000 104422 000000
0002640 000000 000037 000034 000037 000036 000037 000036 000037
0002660 000037 000036 000037 000036 000037 043120 020712 020716
0002700 000001 000005 000515 000072 000457 051505 000124 042105
0002720 000124 060504 020171 067515 020156 030060 030040 035060
0002740 030060 030072 020060 034461 030060 000012 072523 046556
0002760 067157 072524 053545 062145 064124 043165 064562 060523
0003000 000164 060512 043156 061145 060515 040562 071160 060515
0003020 045171 067165 072512 040554 063565 062523 047560 072143
0003040 067516 042166 061545 000000 000000 000000 000000 000000
0003060 000000
0010060 000000 000020 000001 177770 177774 177777 071554 000000
0010100
#
</pre></div>
<p>and also:</p>
<blockquote>
<p>'db' works<br>
'cp' works<br>
'rm' works </p>
<p>'sleep 360 &' followed by 'ls' works, and then when the 'sleep' ends no longer works! So confirmation about
memory location dependence.</p>
<p>'cp /bin/ls xls' followed by 'xls' does not work (dumps core); works with 'sleep' as with 'ls' above.</p>
</blockquote>
<hr>
<blockquote>
<p>Okay, last experiment, booting up, then depositing trap catcher from the front panel into vector 250:</p>
<div class="highlight"><pre><span></span><span class="mi">000250</span><span class="o">:</span> <span class="mi">000252</span>
<span class="mi">000252</span><span class="o">:</span> <span class="mi">000000</span>
</pre></div>
<p>...then issuing the 'ls' seems to catch it. I can then examine registers and memory etc. from the front
panel. This is a quick and easy repro. I went ahead and dumped a few of the KT11 registers (but it's late,
so I can't guarantee I didn't slip up -- should try this again when I'm fresh):</p>
<div class="highlight"><pre><span></span><span class="n">SR0</span><span class="o">:</span> <span class="mi">040143</span> <span class="o">(</span><span class="n">ah</span><span class="o">!</span> <span class="n">page</span> <span class="n">length</span> <span class="n">fault</span><span class="o">,</span> <span class="n">user</span> <span class="n">I</span><span class="o">-</span><span class="n">space</span><span class="o">,</span> <span class="n">page</span> <span class="mi">1</span><span class="o">)</span>
<span class="n">SR1</span><span class="o">:</span> <span class="mi">000000</span> <span class="o">(</span><span class="n">no</span> <span class="n">auto</span> <span class="n">inc</span><span class="o">/</span><span class="n">dec</span> <span class="n">to</span> <span class="n">clean</span> <span class="n">up</span><span class="o">)</span>
<span class="n">SR2</span><span class="o">:</span> <span class="mi">010210</span> <span class="o">(</span><span class="n">virtual</span> <span class="n">PC</span><span class="o">,</span> <span class="n">agrees</span> <span class="k">with</span> <span class="n">your</span> <span class="n">deduction</span> <span class="n">from</span> <span class="n">core</span> <span class="n">dump</span><span class="o">)</span>
<span class="n">SR3</span><span class="o">:</span> <span class="mi">000000</span> <span class="o">(</span><span class="n">that</span><span class="s1">'s odd -- shouldn'</span><span class="n">t</span> <span class="n">split</span> <span class="n">I</span><span class="o">/</span><span class="n">D</span> <span class="n">be</span> <span class="n">enabled</span><span class="o">?)</span>
<span class="n">UIPDR</span><span class="o">:</span> <span class="mi">041402</span> <span class="mi">016006</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">066116</span>
<span class="n">UIPAR</span><span class="o">:</span> <span class="mi">001614</span> <span class="mi">001760</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span>
<span class="n">UDPDR</span><span class="o">:</span> <span class="mi">010501</span> <span class="mi">057517</span> <span class="mi">077717</span> <span class="mi">077717</span> <span class="mi">037611</span> <span class="mi">067616</span> <span class="mi">076300</span> <span class="mi">064317</span>
<span class="n">UDPAR</span><span class="o">:</span> <span class="mi">002417</span> <span class="mi">002564</span> <span class="mi">007777</span> <span class="mi">007766</span> <span class="mi">005635</span> <span class="mi">005656</span> <span class="mi">007777</span> <span class="n">oops</span>
</pre></div>
<p>...where "oops" means I thought I was done scribbling all these down, and turned off the machine. Did I
mention it's late? :-)</p>
</blockquote>
<p>[Note: It <em>was</em> late, and there is an error with UIPAR7 in this transcription. This will be the source of
some uncertainty until corrected on February 2.]</p>
<h3>January 31</h3>
<p>Noel:</p>
<blockquote>
<blockquote>
<p>'sleep 360 &' followed by 'ls' works, and then when the 'sleep' ends no longer works! So confirmation about
memory location dependence.</p>
</blockquote>
<p>Yeah, that's a really important data-point. The fact that it is physical location dependent really does tend
to implicate the KT11; I think the KB11 mostly only knows/has virtual addresses? (So I probably shouldn't
bang my head trying to think of failure modes in the KB11?) If you have the source for its diag, you might
try looking through it, looking for things it doesn't try...</p>
<p>Although I suppose it could be a location-dependent issue with the RK11. I should explain how to find, and
examine the pure-text for the 'ls' command; if you halt the CPU on the trap again, look at UISA0, and that
should give you the 'click' where the text starts; at that point I'd probably examine every 256th (block
size) word and we can compare them to the original to make sure the in-core copy is OK.</p>
</blockquote>
<hr>
<blockquote>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">SR0</span><span class="o">:</span> <span class="mi">040143</span> <span class="o">(</span><span class="n">ah</span><span class="o">!</span> <span class="n">page</span> <span class="n">length</span> <span class="n">fault</span><span class="o">,</span> <span class="n">user</span> <span class="n">I</span><span class="o">-</span><span class="n">space</span><span class="o">,</span> <span class="n">page</span> <span class="mi">1</span><span class="o">)</span>
<span class="n">SR2</span><span class="o">:</span> <span class="mi">010210</span> <span class="o">(</span><span class="n">virtual</span> <span class="n">PC</span><span class="o">,</span> <span class="n">agrees</span> <span class="k">with</span> <span class="n">your</span> <span class="n">deduction</span> <span class="n">from</span> <span class="n">core</span> <span class="n">dump</span><span class="o">)</span>
</pre></div>
</blockquote>
<p>If it's really 010210, I wonder how it could be a fault on page 1; each page (segment, really) of virtual
address space is 020000 long, so that address is well inside page 0?</p>
<p>Unless it has fetched some other instruction, due to some other error, one which does try and do something
on page 1... Might want to try looking at a few instructions around 010210 when you try this again, see
what's actually there. Let's see, code starts at 0161400 in real memory (per UIPAR0 below), so 010210 is at
0171610... Maybe dump a few words from 171600 on?</p>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">SR3</span><span class="o">:</span> <span class="mi">000000</span> <span class="o">(</span><span class="n">that</span><span class="s1">'s odd -- shouldn'</span><span class="n">t</span> <span class="n">split</span> <span class="n">I</span><span class="o">/</span><span class="n">D</span> <span class="n">be</span> <span class="n">enabled</span><span class="o">?)</span>
</pre></div>
</blockquote>
<p>No; you're running binary for a /40 system, so no split I/D. So also, all the UDPARs and UDPDRs will contain
junk.</p>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">UIPAR</span><span class="o">:</span> <span class="mi">001614</span> <span class="mi">001760</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span>
</pre></div>
</blockquote>
<p>?? UIPAR7 looks wrong; if the data is really at 01760, I think the stack should be above that in real memory
- but I might be wrong, I will check.</p>
<p>If it is wrong, did something cause the wrong value to be stored there (e.g. an error in the execution of
lines 1750/1751 in Lions); or was the prototype calculated wrong (around line 1704) - but I think the
prototypes looked correct in the process' core dump, but I will check them; or did the hardware flake out,
and e.g. copy a later store (the code fills them from the top down) up to UISA7?</p>
<p>To check out the latter, maybe a bespoke tiny program, toggled in, to try storing the 'correct' data in the
UISPARs, in the exact way that the Unix code does it, and then look and see what's in there?</p>
<p>This might also correlate to the strange stuff I saw in the process' user-mode stack, in the dump - I will
go back and look at that now.</p>
<p>If you do this again, please add KISA6 and KISD6 to the registers to dump (you can skip UDS*), so we can see
what it thinks is going on with the per-process swappable data, which should be just below the process'
user-mode data, in terms of real memory.</p>
</blockquote>
<hr>
<blockquote>
<p>Yes, the stack is directly above the user data, which is directly above the swappable per-process data (user
struct, and kernel stack). But the address math for stack segments in the KT11 is weird (see below).</p>
<p>I <em>think</em> the prototypes:</p>
<div class="highlight"><pre><span></span>UISA 000000 000020 000000 000000 000000 000000 000000 177701
UISD 041402 016006 000000 000000 000000 000000 000000 066016
</pre></div>
<p>are right, but the negative direction of the stack is making my head hurt (and the UISA7 you recorded from
the hardware might be right after all - but then the UISA0 might be wrong - it's suspicious, but not
impossible, that they are the same value).</p>
<p>If the SPPD is at physical xxx, the user data will be at xxx+20 (in clicks, as above) through xxx+20+34
(below), and then the stack above that. Per the SPPD:</p>
<div class="highlight"><pre><span></span>tsize 000104
dsize 000035
ssize 000024
</pre></div>
<p>the stack should then run from xxx+20+35 to xxx+20+35+23. The way the MM hardware works for stack segment,
the 'base' is where the first click would be if the segment were a full 0200 clicks. (Per the example in the
/45 proc handbook; for a 3-click stack running from physical 0331500 to 0331776, the PAR would contain
03120, i.e. segment base at 0312000.)</p>
<p>So let me do the math (please check to see if I'm confused :-); base of user data is at 0176000 (per UISA1
contents), runs to 0201476 (i.e. plus 03500); the stack would run from 0201500 to 0204076 (i.e. plus 02400).
So the stack segment 'base' would be 020000 below the next word, or 0164100.</p>
<p>(My head hurts too much to work out if the 177701 of the prototype is right; basically, the location of the
SPPD in clicks would be 01740 (I <em>think</em> - 01760 - 020), and that plus 177701 should give us 01641.)</p>
<p>But, anyway, I'm fairly sure that 01614 is <em>not</em> right for UISA7 (unless it really was 1641 and you inverted
the digits because it looked so close).</p>
<p>Having KISA6 would help since it would give us a cross-check on the value of UISA1.....</p>
</blockquote>
<hr>
<blockquote>
<p>So, according to the process core dump, these are the register contents at the time of the fault:</p>
<div class="highlight"><pre><span></span>R0 177770
R1 0
R2 0
R3 0
R4 34
R5 444
SP 177760
PC 010210
PS 170010
</pre></div>
<p>Now, PDP-11 uses R5 for a frame pointer, set up thus:</p>
<div class="highlight"><pre><span></span> jsr r5,csv (first instruction in every C routine)
csv:
mov r5,r0
mov sp,r5
mov r4,-(sp)
mov r3,-(sp)
mov r2,-(sp)
tst -(sp)
jmp (r0)
</pre></div>
<p>on subroutine entry (the 'jsr r5, csv' pushes the old R5 contents, and temporarily saves the return PC - to
just after the call to CSV, not to the subroutine which called this one, that's further down - in R5). So,
except for the first two instructions of CSV, R5 <em>always</em> contains an old SP.</p>
<p>Now look at the R5 from the crash. That's not an old SP. Something has already gone seriously wrong by this
point - actually, likely the process has just started to run the newly-loaded command code (see below), and
hasn't even set up its first stack frame yet.</p>
<p>Now look at the top of the stack, as recorded in the process' core dump:</p>
<div class="highlight"><pre><span></span><span class="mi">0010060</span><span class="o">:</span> <span class="mi">000000</span> <span class="mi">000020</span> <span class="mi">000001</span> <span class="mi">177770</span> <span class="mi">177774</span> <span class="mi">177777</span> <span class="mi">071554</span> <span class="mi">000000</span>
</pre></div>
<p>And that's <em>it</em>; the rest is all 0's! (The base address does seem to correspond; with:</p>
<div class="highlight"><pre><span></span>dsize 000035
ssize 000024
</pre></div>
<p>and the SPPD being 020 clicks, that puts the top of the stack at 0101 clicks, or 010100, and the last
location there is 010076.)</p>
<p>The core dump routine, core() writes the user data out in two transfers (Lions 4113-4124), one for the SPPD,
one for the user's data+stack. So we probably got the SPPD OK, but the rest - who knows?</p>
<p>It does call estabur(), which sets up the prototype MM register contents, and then writes them into the
actual registers, so the prototypes in the process' core dump that I was looking at before have already been
overwritten. :-( But estabur() then called sureg (Lions 1724) so hopefully the MM regs wound up pointing to
the actual memory being used for the stack - but who knows?</p>
<p>Anyway, looking at the contents, the top of the stack does look vaguely like what it should be when the
command <em>starts</em> executing, after the exec() call; the SP is even reasonable; it points to that 0 at offset
010060.</p>
<p>The 020 is the return point for the call to _main (see below; that 'jsr pc,_main' ends at 016); the '1' is
probably 'nargs' (see Exec(II) in the V6 Manual), the '0177770' is argv, '177774' is argv[0], 177777 is
argv[1] (end of list marker), and '071554' is 'ls' (the command name, by convention the first argument).</p>
<p>R0 contains what looks like an old SP, although I suppose that could have been
left over from the assembler startup:</p>
<div class="highlight"><pre><span></span><span class="n">start</span><span class="o">:</span>
<span class="n">setd</span>
<span class="n">mov</span> <span class="n">sp</span><span class="o">,</span><span class="n">r0</span>
<span class="n">mov</span> <span class="o">(</span><span class="n">r0</span><span class="o">),-(</span><span class="n">sp</span><span class="o">)</span>
<span class="n">tst</span> <span class="o">(</span><span class="n">r0</span><span class="o">)+</span>
<span class="n">mov</span> <span class="n">r0</span><span class="o">,</span><span class="mi">2</span><span class="o">(</span><span class="n">sp</span><span class="o">)</span>
<span class="n">jsr</span> <span class="n">pc</span><span class="o">,</span><span class="n">_main</span>
</pre></div>
<p>but clearly the attempt to execute the first instruction in CSV blew up. And where did the '444' in R5 come
from? The call to CSV is at 030?</p>
</blockquote>
<h3>February 1</h3>
<p>Noel, regarding the second core file:</p>
<blockquote>
<p>I took a quick look, and everything 'important' seems to be identical: the registers, PC, etc at the time of
the trap (including that mysterious '444' in R5); the prototype MM registers; the user's stack (looking
again like the command just started).</p>
</blockquote>
<hr>
<blockquote>
<blockquote>
<p>I went ahead and dumped a few of the KT11 registers</p>
<div class="highlight"><pre><span></span><span class="n">UIPDR</span><span class="o">:</span> <span class="mi">041402</span> <span class="mi">016006</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">066116</span>
</pre></div>
</blockquote>
<p>Oh, BTW, I checked, and these match the prototype values in the user struct.</p>
</blockquote>
<h3>February 2-3</h3>
<p>A tip from Noel:</p>
<blockquote>
<p>Something stirred this in my memory: the best quick overview of the internals of the Bell PDP-11 Unixes is
K. Thompson, "UNIX Implementation", available here:</p>
<p><a href="https://users.soe.ucsc.edu/~sbrandt/221/Papers/History/thompson-bstj78.pdf">https://users.soe.ucsc.edu/~sbrandt/221/Papers/History/thompson-bstj78.pdf</a></p>
<p>if you want to know more about what the insides are like.</p>
</blockquote>
<p>Fritz:</p>
<blockquote>
<p>Okay, here's the latest, done with some care:</p>
<div class="highlight"><pre><span></span><span class="n">UISD</span><span class="o">:</span> <span class="mi">041402</span> <span class="mi">016006</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">066116</span>
<span class="n">UISA</span><span class="o">:</span> <span class="mi">001614</span> <span class="mi">001760</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001641</span>
<span class="n">KISD</span><span class="o">:</span> <span class="mi">077406</span> <span class="mi">077406</span> <span class="mi">077406</span> <span class="mi">077506</span> <span class="mi">077506</span> <span class="mi">077406</span> <span class="mi">007506</span> <span class="mi">077506</span>
<span class="n">KISA</span><span class="o">:</span> <span class="mi">000000</span> <span class="mi">000200</span> <span class="mi">000400</span> <span class="mi">000600</span> <span class="mi">001000</span> <span class="mi">001200</span> <span class="mi">001740</span> <span class="mi">007600</span>
<span class="n">SRs</span><span class="o">:</span> <span class="mi">040143</span> <span class="mi">000000</span> <span class="mi">010210</span> <span class="mi">000000</span>
<span class="mi">171600</span><span class="o">:</span> <span class="mi">016162</span> <span class="mi">004767</span> <span class="mi">000224</span> <span class="mi">000414</span> <span class="mi">006700</span> <span class="mi">006152</span> <span class="mi">006702</span> <span class="mi">006144</span>
</pre></div>
</blockquote>
<p>[Note: this fixes the previous late-night transcription error with UISA7...]</p>
<p>Noel:</p>
<blockquote>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">UISD</span><span class="o">:</span> <span class="mi">041402</span> <span class="mi">016006</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">000000</span> <span class="mi">066116</span>
<span class="n">UISA</span><span class="o">:</span> <span class="mi">001614</span> <span class="mi">001760</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001614</span> <span class="mi">001641</span>
</pre></div>
</blockquote>
<p>So, 'good news' is these are the same except for UISA7, for which as I suspected, it looks like the digits
were transposed. But the new value is exactly the one I calculated.</p>
<p>'Bad news' is that takes out what I was thinking might be a potential cause, which was UPAR's getting
trashed by hardware failure. So more hard work ahead (see below).</p>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">KISD</span><span class="o">:</span> <span class="mi">077406</span> <span class="mi">077406</span> <span class="mi">077406</span> <span class="mi">077506</span> <span class="mi">077506</span> <span class="mi">077406</span> <span class="mi">007506</span> <span class="mi">077506</span>
<span class="n">KISA</span><span class="o">:</span> <span class="mi">000000</span> <span class="mi">000200</span> <span class="mi">000400</span> <span class="mi">000600</span> <span class="mi">001000</span> <span class="mi">001200</span> <span class="mi">001740</span> <span class="mi">007600</span>
</pre></div>
</blockquote>
<p>Those all look OK: KISD6 shows the segment length as 020 (017 being the last valid click), which is right,
and KISA6 is 01740, so with the user area and kernel stack being 20 clicks, that makes the start of the user
data 01760, which is what UISA1 contains.</p>
<blockquote>
<div class="highlight"><pre><span></span><span class="n">SRs</span><span class="o">:</span> <span class="mi">040143</span> <span class="mi">000000</span> <span class="mi">010210</span> <span class="mi">000000</span>
</pre></div>
</blockquote>
<p>OK, same failing location as before (010210); SSR0 shows:</p>
<div class="highlight"><pre><span></span>Abort - page length error
User mode
Page 1
</pre></div>
<p>which is the same as last time.</p>
<div class="highlight"><pre><span></span><span class="mi">171600</span><span class="o">:</span> <span class="mi">016162</span> <span class="mi">004767</span> <span class="mi">000224</span> <span class="mi">000414</span> <span class="mi">006700</span> <span class="mi">006152</span> <span class="mi">006702</span> <span class="mi">006144</span>
</pre></div>
<p>Let me just re-check the math here: text base is 0161400, plus a PC of 010210, gives us 0171610, which is
right in the middle there - thanks!</p>
<p>That does not, alas, look anything <em>at all</em> like what's <em>supposed</em> to be there, which is:</p>
<div class="highlight"><pre><span></span><span class="mi">010200</span><span class="o">:</span> <span class="mi">110024</span>
<span class="mi">010400</span> <span class="n">mov</span> <span class="n">r4</span><span class="o">,</span><span class="n">r0</span>
<span class="mi">000167</span> <span class="n">jmp</span> <span class="mi">10226</span> <span class="o">(</span><span class="n">cret</span><span class="o">)</span>
<span class="mi">000016</span>
<span class="mi">010500</span> <span class="n">mov</span> <span class="n">r5</span><span class="o">,</span><span class="n">r0</span> <span class="o">(</span><span class="n">start</span> <span class="n">of</span> <span class="n">CSV</span><span class="o">)</span>
<span class="mi">010605</span> <span class="n">mov</span> <span class="n">sp</span><span class="o">,</span><span class="n">r5</span>
<span class="mi">010446</span> <span class="n">mov</span> <span class="n">r4</span><span class="o">,-(</span><span class="n">sp</span><span class="o">)</span>
<span class="mi">010346</span> <span class="n">mov</span> <span class="n">r3</span><span class="o">,-(</span><span class="n">sp</span><span class="o">)</span>
</pre></div>
<p>So maybe the RK11 went berserk? But maybe not...</p>
<p>The 4767 is a 'jsr pc, xxx' which is typical C compiler emission, but the rest looks like rubbish - 6700 is
a SXT R0, for instance.</p>
<p>What's actually there at 010210 (virtual) still doesn't explain the MM trap we got; 'SXT R0' should have
executed OK, no matter what? Confoozled...</p>
<p>What's also odd is how it got here; it's almost like the first few instructions:</p>
<div class="highlight"><pre><span></span><span class="n">start</span><span class="o">:</span>
<span class="n">setd</span>
<span class="n">mov</span> <span class="n">sp</span><span class="o">,</span><span class="n">r0</span>
<span class="n">mov</span> <span class="o">(</span><span class="n">r0</span><span class="o">),-(</span><span class="n">sp</span><span class="o">)</span>
<span class="n">tst</span> <span class="o">(</span><span class="n">r0</span><span class="o">)+</span>
<span class="n">mov</span> <span class="n">r0</span><span class="o">,</span><span class="mi">2</span><span class="o">(</span><span class="n">sp</span><span class="o">)</span>
<span class="n">jsr</span> <span class="n">pc</span><span class="o">,</span><span class="n">_main</span>
<span class="n">_main</span><span class="o">:</span>
<span class="n">jsr</span> <span class="n">r5</span><span class="o">,</span><span class="n">csv</span>
</pre></div>
<p>executed OK, and then it tried to go off to csv, only there's trash there? And what's with the 0444 in R5?
That should be 034, the return from that last JSR.</p>
<p>I'm going to go ponder all this. One more thing you could try is do this all again, and write down the first
couple of instructions at the start of the text segment (UISA0 = 01614, so 0161400 on for a few words), so
we can see if <em>that</em> looks OK.</p>
<p>If so, it will look like the command got read in off the disk wrong - since it's not coming from swap (it's
just starting), it's coming out of the file system wrong. Why will be a good question.</p>
<p>And I still don't understand the 'segment 1' fault, and the R5 contents - so many things going wrong all at
once, for reasons that make no sense... I wonder if there's a noise glitch hitting several things all at the
same time?</p>
</blockquote>
<p>Fritz:</p>
<blockquote>
<p>I read a bit through the KT11 maintenance manual you sent yesterday, to refresh myself on it a bit (thanks
for that!). I realized I almost always use my console in "PROG PHY" or "CONS PHY" mode; but using "USER I"
and "KERNEL I" I may be able to verify quickly that the KT11 is thinking VA:010210 -> PA:171610.</p>
<p>When I set this up to try later, I'll examine that start of the text segment at 161400 as well, per your
recommend.</p>
</blockquote>
<h3>February 4</h3>
<p>Noel sends up a flare on cctalk in the early AM, summarizing the problem and experiments to date.
Suggestions start to flow in. Some have already been tried or can be ruled out. Some others:</p>
<ul>
<li>
<p>Bob Smith: "I keep wondering about the psu...". This gets some agreement from the list, and a few
interesting/relevant anecdotes are relayed. Paul Koning:</p>
<blockquote>
<p>In RSTS development we once ran into DMC-11s not working reliably. The field service tech knew exactly
what to look for, and started checking all the supply voltages. The spec says allowed tolerances are
+/- 5%. He knew the reality for correct operation was -0%, +5%, so he tweaked all the supplies to read
a hair above nominal.</p>
</blockquote>
<p>Warner Losh:</p>
<blockquote>
<p>I recall our PDP-11 tech tweaking +5V from 5.05V to 4.95V and back again to demonstrate that tiny
differences matter a lot on one of the cranky 11/23+'s we had after I made a particularly unhelpful
teenage smart ass remark... The 11/23+ wouldn't boot at the slightly lower than full voltage.</p>
</blockquote>
<p>It is worth noting that in both of these cases, a slight undervoltage proved problematic...</p>
</li>
<li>
<p>Paul Koning suggests a potential KT11 failure mode:</p>
<blockquote>
<p>Another possibility occurs to me: bad bits in the MMU (UISAR0 register if I remember correctly). Bad
memory is likely to show up with a few bits wrong; if UISAR0 has a stuck bit so the "plain" case maps
incorrectly you'd expect to come up with execution that looks nothing at all like what was intended.</p>
</blockquote>
<p>Noel provides a short diagnostic (apparently, straight from his mind to machine code; props! :-) to check
read-after-write on UISA* so we can rule this out:</p>
<div class="highlight"><pre><span></span><span class="mi">1000</span><span class="o">:</span> <span class="mi">12706</span> <span class="o">/</span> <span class="n">Put</span> <span class="n">stack</span> <span class="n">at</span> <span class="mi">0700</span>
<span class="mi">700</span>
<span class="mi">12701</span> <span class="o">/</span> <span class="n">Load</span> <span class="n">UISA0</span> <span class="n">address</span> <span class="k">in</span> <span class="n">R1</span>
<span class="mi">177640</span>
<span class="mi">5000</span> <span class="o">/</span> <span class="n">Start</span> <span class="n">testing</span> <span class="n">at</span> <span class="mi">0</span>
<span class="mi">10011</span> <span class="o">/</span> <span class="n">Store</span> <span class="n">it</span>
<span class="mi">20011</span> <span class="o">/</span> <span class="n">Check</span> <span class="n">it</span>
<span class="mi">1401</span> <span class="o">/</span> <span class="n">Skip</span> <span class="k">if</span> <span class="n">match</span>
<span class="mi">0</span> <span class="o">/</span> <span class="n">Halt</span> <span class="n">here</span> <span class="n">on</span> <span class="n">error</span>
<span class="mi">5200</span> <span class="o">/</span> <span class="n">Next</span> <span class="n">value</span>
<span class="mi">20027</span> <span class="o">/</span> <span class="mi">07777</span> <span class="n">or</span> <span class="n">less</span><span class="o">?</span>
<span class="mi">7777</span>
<span class="mi">101770</span> <span class="o">/</span> <span class="n">Go</span> <span class="n">around</span>
<span class="mi">5721</span> <span class="o">/</span> <span class="n">Next</span> <span class="n">register</span>
<span class="mi">20127</span> <span class="o">/</span> <span class="n">Done</span> <span class="n">them</span> <span class="n">all</span><span class="o">?</span>
<span class="mi">177660</span>
<span class="mi">101401</span> <span class="o">/</span> <span class="n">Skip</span> <span class="k">if</span> <span class="n">not</span>
<span class="mi">0</span> <span class="o">/</span> <span class="n">Halt</span> <span class="n">here</span> <span class="n">when</span> <span class="n">done</span>
<span class="mi">137</span> <span class="o">/</span> <span class="n">Go</span> <span class="n">back</span>
<span class="mi">1010</span>
</pre></div>
<p>This is toggled in and passes on the machine.</p>
</li>
<li>
<p>Mattis Lind:</p>
<blockquote>
<p>Would it be any difference if you run the machine at full speed or lower speed or even single step past
this instruction? ... The TIG module has a separate non crystal controlled oscillator which one could
tune for marginal checking.</p>
</blockquote>
<p>Ah, yes, the margining clock! Always worth a check, and very easy to use if you have a KM11 handy.
A variety of clock speeds are tried, but the behavior remains the same.</p>
</li>
<li>
<p>Brent Hilpert:</p>
<blockquote>
<p>For consideration, what about the refresh circuitry of the memory board?</p>
<p>Mem diagnostics, unless they explicitly account for it, may not show up problems with memory refresh if
the loop times are short enough to effectively substitute as refresh cycles, while they could show up
later in real-world use with arbitrary time between accesses.</p>
<p>Refresh on some early boards/systems was asynchronously timed by monostables or onboard oscillators
which can drift or fail on the margin/slope. (I don't know what DEC's design policy was for DRAM
refresh). It might also explain why a number of 4116s were (apparently) failing earlier in the efforts
(if I recall the discussion correctly), replacing them might have just replaced them with 'slightly
better' chips, i.e. with a slightly longer refresh tolerance.</p>
</blockquote>
<p>This one also gets some follow-up. The schematics are consulted, and the MS11-L refresh is seen, indeed,
to be driven by a simple free-running 555. Further from Brent:</p>
<blockquote>
<p>4116 datasheet specs 2mS, my calcs give a refresh period of 1.5mS, the 14.5uS from the manual would give
1.86 mS, 7% shy of 2. The schematic specs 1% resistors, and the parts list does appear to spec a
high-tolerance "1%200PPM" cap.</p>
<p>Although there are the internal voltage divider Rs in the 555 which are also critical for the timing and
everything is 40+ years old...</p>
</blockquote>
<p>The actual MS11 in use measures out on my 'scope at 15.2us. From Brent:</p>
<blockquote>
<p>15.2uS gives a 1.95mS refresh, so it's awfully close to the 2mS spec, but still within. The datasheet I
was looking at doesn't seem to give any spec for tolerance on the refresh so one would guess there's a
safety margin built into the 2mS spec.</p>
</blockquote>
</li>
</ul>
<p>Fritz:</p>
<blockquote>
<blockquote>
<div class="highlight"><pre><span></span>R0 177770
R1 0
R2 0
R3 0
R4 34
R5 444
SP 177760
PC 010210
060: 000000 000020 000001 177770 177774 177777 071554 000000
</pre></div>
</blockquote>
<p>Okay, I've had a bit of time in front of the machine to repro this and take a look. What I actually see is:</p>
<div class="highlight"><pre><span></span>R0 177770
R1 0
R2 0
R3 0
R4 0
R5 34
R6 141774
PC 000254
</pre></div>
<p>(remember, for the last, this will have been after taking a trap to 250, where I have the usual "BR .+2;
HALT" catcher installed)</p>
<p>Also, memory at 060 (PA:164060) is all zeros as far as the eye can see...</p>
</blockquote>
<p>Then, a big discovery from Noel:</p>
<blockquote>
<p>Argh. (Very red face!)</p>
<p>I worked out the trap stack layout by looking at m40.s and trap.c, and totally forgot about the return PC
(that's the 0444) from the call to trap():</p>
<div class="highlight"><pre><span></span>0001740 000013 141756 022050 000013 000000 000000 000000 000034
0001760 000444 000031 177760 000000 030351 177770 010210 170010
</pre></div>
<p>I clearly should have looked at core(V) in the V6 manual!</p>
<p>The R6 you have recorded is correct for just after the trap; that's the kernel mode SP, which points to the
top of the kernel stack, in segment 6 (in the swappable per-process kernel area, which runs from
140000-1776).</p>
<p>So there is no R5 mystery, I was just confused. Back to the other two!</p>
</blockquote>
<p>But meanwhile, back in front of the actual machine:</p>
<blockquote>
<p>Seeing some quite strange stuff now, after the crash, flipping between "CONS PHY" and "PROG PHY"...</p>
<p>Bits 6-12 are not acting as I would expect, almost as if the KT11 ALU is doing an incorrect operation
(subtraction rather than add!) </p>
<p>I see these are 74S181 bit slice ALUs, and function code should be hardwired to "A+B"... So that brings us
back around to really checking those supply voltages...</p>
</blockquote>
<p>It turns out the +5V supplies were, in fact, slightly low (about 4.9 or so). Trimmed these up, and the
observed problems with bits 6-12 receded, though the "ls" crash remained exactly the same. It would appear,
though, consistent with remarks above, that the machine has very little undervoltage tolerance on +5V --
certainly less than the documented -5%.</p>
<p>How long had the machine been in this condition, and what else might have been affected? It could not have
been for very long, since the previously run KT11 diagnostics would certainly have failed. But the situation
was spooky, and instilled some uncertainty about other data that had recently been retrieved via the front
panel...</p>
<h3>February 5</h3>
<p>Noel clears away one additional address calculation error:</p>
<blockquote>
<p>So I had to grub a bit to find this, but here's what I said:</p>
<blockquote>
<p>With KISA7 at 001641, 0164100 should be the first location after the stack, so 0164060 and up would be
good. They <em>should</em> be:</p>
<div class="highlight"><pre><span></span><span class="mi">060</span><span class="o">:</span> <span class="mi">000000</span> <span class="mi">000020</span> <span class="mi">000001</span> <span class="mi">177770</span> <span class="mi">177774</span> <span class="mi">177777</span> <span class="mi">071554</span> <span class="mi">000000</span>
</pre></div>
</blockquote>
<p>and I have no idea how I screwed the address there up that badly. The data I'm showing there is the top
(address-wise; i.e. bottom, push-pop-wise) of the user stack, and I think it's correct. However, it's UISA7
which contains 01641, and that's the 'bottom' of that segment. I had previously done the math correctly:</p>
<blockquote>
<p>base of user data is at 0176000 (per UISA1 contents), runs to 0201476 (i.e. plus 03500); the stack would
run from 0201500 to 0204076 (i.e. plus 02400). So the stack segment 'base' would be 020000 below the next
word, or 0164100.</p>
</blockquote>
<p>So physical 0164060 is just in the middle of nowhere; it's somewhere in the middle of the text (which starts
at physical 0161400).</p>
<p>If you could try this again, and check the top of the <em>actual</em> user stack (which will be at physical
0204060-0204076), I'd really appreciate it. I do expect it to be correct: the process core dump has it
correct (as shown by the analysis of argc, argv, etc).</p>
</blockquote>
<p>And I am able to get some consistent, correct, data after the power-supply tune-up:</p>
<blockquote>
<p>Okay, latest numbers for you!</p>
<p>Stack, confirmed:</p>
<div class="highlight"><pre><span></span><span class="n">PA</span><span class="o">:</span><span class="mi">204060</span><span class="o">:</span> <span class="mi">000000</span> <span class="mi">000020</span> <span class="mi">000001</span> <span class="mi">177770</span> <span class="mi">177774</span> <span class="mi">177777</span> <span class="mi">071554</span> <span class="mi">000000</span>
</pre></div>
<p>Text; as I had feared, a few dropped bits there! Went ahead and grabbed you eight extra words while I was
there:</p>
<div class="highlight"><pre><span></span><span class="n">PA</span><span class="o">:</span><span class="mi">171600</span><span class="o">:</span> <span class="mi">016162</span> <span class="mi">004767</span> <span class="mi">000224</span> <span class="mi">000414</span> <span class="mi">016700</span> <span class="mi">016152</span> <span class="mi">016702</span> <span class="mi">016144</span>
<span class="n">PA</span><span class="o">:</span><span class="mi">171620</span><span class="o">:</span> <span class="mi">004767</span> <span class="mi">000206</span> <span class="mi">000405</span> <span class="mi">012404</span> <span class="mi">012467</span> <span class="mi">016124</span> <span class="mi">000167</span> <span class="mi">177346</span>
</pre></div>
<p>In disassembly from 171602, this yields:</p>
<div class="highlight"><pre><span></span><span class="mi">171602</span><span class="o">:</span> <span class="n">JSR</span> <span class="n">PC</span><span class="o">,</span><span class="mi">172032</span>
<span class="mi">171606</span><span class="o">:</span> <span class="n">BR</span> <span class="mi">171640</span>
<span class="mi">171610</span><span class="o">:</span> <span class="n">MOV</span> <span class="mi">7766</span><span class="o">,</span><span class="n">R0</span>
<span class="mi">171614</span><span class="o">:</span> <span class="n">MOV</span> <span class="mi">7764</span><span class="o">,</span><span class="n">R2</span>
<span class="mi">171620</span><span class="o">:</span> <span class="n">JSR</span> <span class="n">PC</span><span class="o">,</span><span class="mi">172032</span>
<span class="mi">171624</span><span class="o">:</span> <span class="n">BR</span> <span class="mi">171640</span>
<span class="mi">171626</span><span class="o">:</span> <span class="n">MOV</span> <span class="o">(</span><span class="n">R4</span><span class="o">)+,</span><span class="n">R4</span>
<span class="mi">171630</span><span class="o">:</span> <span class="n">MOV</span> <span class="o">(</span><span class="n">R4</span><span class="o">)+,</span><span class="mi">7760</span>
<span class="mi">171634</span><span class="o">:</span> <span class="n">JMP</span> <span class="mi">171206</span>
</pre></div>
<p>...which looks at least like feasible code, if not the code we are expecting?</p>
</blockquote>
<p>Last, a note on procedure for using the front panel to verify KT11 address mappings:</p>
<blockquote>