;;; -*- Mode: LISP; Syntax: Common-lisp; Package: CMI; Base: 10; Patch-File: Yes -*-

(in-package 'cmi)

(defucmacro fast-increment-and-load-ta (source count)
  ;; Increment the source and load the result directly into TA.  Clears  the TA pointer on
  ;; entry. 
 `(ucblock ()
    (ui (mop-maddr ,source)		; Use a safe maddr
	(sop :float-bus :transposer-a
	     :float-bus-pointer-control :post-clear
	     :float-bus-direction :write))
    (ui	(mop::loadb (xor :a t) :condition-invert 1 :condition-flag :zero-flag))
    (ui (mop-maddr (,source ++))
	(mop::loada (and :a t)))
    (ui (mop-maddr (constant (eval *sr-scratch-slice*)))
	(store-to-ta :write-flag :carry-flag)
	(clk-cntl 4-tick))
    (ui (mop-maddr ,source)		; Use a safe maddr
	(mop::loadb (xor :a :f) :condition-invert 1 :condition-flag :zero-flag)
	(seq push t ,(- count 3)))
    (ucblock-ns
      (ui (mop-maddr (,source ++))
	  (mop::loada (and :a :f) :read-flag :carry-flag))
      (ui (mop-maddr (constant (eval *sr-scratch-slice*)))
	  (store-to-ta :write-flag :carry-flag)
	  (clk-cntl 4-tick)
	  (seq rfct)))
    (ui (alu-simp (- ,source (constant ,(- count 1))) (,source $exp) (y ,source))
	(ld-maddr maddr)
	(mop::loada (and :a :f) :read-flag :carry-flag))
    (ui (mop-maddr (constant (eval *sr-scratch-slice*)))
	(store-to-ta :write-flag :carry-flag))))

(def-sprint-delivery-ucode sprint-queue-index-processor ()

  ;; 1: get queue-index (aref using index from router)
  ;; 2: increment queue-index (by 1, lls-up)
  ;; 3: store queue-index back (aset using index from router)
  ;; 4. multiply queue-index by slice-count and load into TB.

  (with-mm-reg (index queue-index limit)

    ;; Load the recv-p slice into the sprint condition bit.
    (ui (mop-maddr (constant (eval *sr-recv-p*)))
	(sop :memory-bus :condition
	     :memory-bus-direction :write))

    (setreg index temp-dest)
    (setreg index (- index index-bits))

    ;; Load the index bits into TB and clear out the upper bits.
    (ucblock ()
      (ui (alu-simp (- index-bits 1))
	  (ybus-dest-sel dispatch))
      (ui (sop :float-bus :transposer-b
	       :float-bus-direction :read
	       :float-bus-pointer-control :post-clear)
	  (seq push t dispatch))
      (ui (mop-maddr (index ++))
	  (sop :memory-bus :transposer-b
	       :memory-bus-direction :write
	       :memory-bus-pointer-control :post-add)
	  (seq rfct))

      (ui (alu-simp (- (constant 19) index-bits))
	  (ybus-dest-sel dispatch))
      (ui (seq cjp addr-neg done-clearing))
      (ui (mop-maddr (constant (eval *sr-zero-maddr*)))
	  (seq push t dispatch))
      (ui (sop :memory-bus :transposer-b
	       :memory-bus-direction :write
	       :memory-bus-pointer-control :post-add)
	  (seq rfct))
      (label done-clearing))

    (load-sprint-rug-from-memory :vp-size (constant (eval *sr-notify-limit*)))
    (ui)				; Seems to be needed.

    ;; Gather the index.
    (ucblock-ns
      (load-sprint-rug-from-memory :vp-base (constant (eval *sr-notify-base*)))      

      (setreg q (constant 20))
      (setreg queue-index (constant (eval *sr-queue-index*)))
      (ui (setreg ARG1 queue-index)
	  (seq cjs t sprint-send-gather-1-small-message)))

    ;; Compare the index to the limit and store the result in the test flag.
    ;; Ie test-flag == 1 iff index < limit.
    (setreg limit (constant (eval *sr-queue-limit*)))
    (fast-compare queue-index limit 20)

    ;; Increment queue-index and load the result directly into TA.
    (fast-increment-and-load-ta queue-index 20)

    ;; Clear out the high 12 bits of TA.
    (ucblock-ns
      (ui (mop-maddr (constant (eval *sr-zero-maddr*)))
	  (sop :memory-bus :transposer-a
	       :memory-bus-direction :write
	       :memory-bus-pointer-control :post-add))
      (ui (sop :memory-bus :transposer-a
	       :memory-bus-direction :write
	       :memory-bus-pointer-control :post-add)
	  (seq push t 9))
      (ui (sop :memory-bus :transposer-a
	       :memory-bus-direction :write
	       :memory-bus-pointer-control :post-add)
	  (seq rfct)))

    ;; Deliver the updated index.
    (ui (seq cjs t sprint-send-deliver-1-message-from-ta))
    
    ;; Disable overflowing queues.
    (ucblock-ns

      ;; From above test-flag == 1 iff queue-index < limit.  And the test-flag into the recv-p
      ;; slice and update the condition register.  NB: This can't be moved above the
      ;; delivery because the counts must be incremented even if they overflow.
      (lls ((and :b :f) :f)
	   (constant (eval *sr-flipper-rotate*)) (constant (eval *sr-recv-p*))
	   test-flag dont-care :always)
      (ui (mop-maddr (constant (eval *sr-flipper-rotate*)))
	  (sop :memory-bus :condition
	       :memory-bus-direction :write)))
    
    ;; The offset is index-bits * bank-increment + count * element-increment.  We do
    ;; the computation in the wtl3164.  The increments are preloaded (along with a 1).
    ;; The index is in TB and 1+ count is in TA.  The wtl3164 register map is:
    ;;   r0		bank-increment
    ;;   r1		element-increment
    ;;   r2		1
    ;;   r3-4		temp1
    ;;   r5-6		temp2
    ;;   r7		#xfffff
    (ucblock-ns

      ;; Read from transposer slot 0 and logand with #xfffff.
      (ucblock-ns
	(ui (wtl3164-static-instruction :func :ilog :aain 0 :abin 0 :main 0 :mbin 1)
	    (lbl :sprint-lbl-data 0))
	(ui (sprint-rug :rug-read-write :write
			:register :pointer))
	(ui (lbl :sprint-lbl-data #.(* 33 16)))
	;; temp1[0] <- TB[0] logand #xfffff
	(ui (sop :float-bus :transposer-b
		 :float-bus-direction :read)
	    (wtl3164-dynamic-instruction :xcnt :load-int-x :baddr 7 :cdaddr 3))
	;; temp2[0] <- TA[0] logand #xfffff
	(ui (sop :float-bus :transposer-a
		 :float-bus-direction :read)
	    (wtl3164-dynamic-instruction :xcnt :load-int-x :baddr 7 :cdaddr 5)))

      ;; Read from transposer slot 16 and logand with #xfffff.
      (ucblock-ns
	(ui (sprint-rug :rug-read-write :write
			:register :pointer))
	(ui)
	;; temp1[1] <- TB[16] logand #xfffff
	(ui (sop :float-bus :transposer-b
		 :float-bus-direction :read)
	    (wtl3164-dynamic-instruction :xcnt :load-int-x :baddr 7 :cdaddr 4))
	;; temp2[1] <- TA[16] logand #xfffff
	(ui (sop :float-bus :transposer-a
		 :float-bus-direction :read)
	    (wtl3164-dynamic-instruction :xcnt :load-int-x :baddr 7 :cdaddr 6)))

      ;; temp1 <- temp1 * bank-increment
      (ucblock-ns
	(ui (wtl3164-static-instruction :func :fmulil :aain 0 :abin 0 :main 1 :mbin 1))
	(ui (wtl3164-dynamic-instruction :aaddr 0 :baddr 3 :cdaddr 3))
	(ui (wtl3164-dynamic-instruction :aaddr 0 :baddr 4 :cdaddr 4)))

      ;; temp2 <- temp2 - 1.
      (ucblock-ns
	(ui (wtl3164-static-instruction :func :faddi :aain 0 :abin 1 :main 1 :mbin 1))
	(ui (wtl3164-dynamic-instruction :aaddr 2 :baddr 5 :cdaddr 5))
	(ui (wtl3164-dynamic-instruction :aaddr 2 :baddr 6 :cdaddr 6)))

      ;; temp2 <- temp2 * element-increment
      (ucblock-ns
	(ui (wtl3164-static-instruction :func :fmulil :aain 0 :abin 0 :main 1 :mbin 1))
	(ui (wtl3164-dynamic-instruction :aaddr 1 :baddr 5 :cdaddr 5))
	(ui (wtl3164-dynamic-instruction :aaddr 1 :baddr 6 :cdaddr 6)))
      
      ;; temp1 <- temp1 + temp2
      (ucblock-ns
	(ui (wtl3164-static-instruction :func :faddi :aain 0 :abin 0 :main 1 :mbin 1))
	(ui (wtl3164-dynamic-instruction :aaddr 3 :baddr 5 :cdaddr 3))
	(ui (wtl3164-dynamic-instruction :aaddr 4 :baddr 6 :cdaddr 4)))

      ;; Put temp1 into TB[0,16] so the two values can be used as indirect addressing
      ;; offsets. 
      (ucblock-ns

	;; First write out the two results to TA[0,16].  Since they are written slicewise
	;; they will be read processorwise.  Leave both the TA and TB pointer at zero.
	(ucblock-ns
	  (ui (lbl :sprint-lbl-data 0)
	      (wtl3164-static-instruction :func :monadic))
	  (ui (sprint-rug :rug-read-write :write
			  :register :pointer)
	      (wtl3164-dynamic-instruction :xcnt :store-int-reg
					   :baddr :nop-recr
					   :efaddr 3))
	  (ui (lbl :sprint-lbl-data 16))
	  (ui (sop :float-bus :transposer-a
		   :float-bus-direction :write))
	  (ui (sprint-rug :rug-read-write :write
			  :register :pointer)
	      (wtl3164-dynamic-instruction :xcnt :store-int-reg
					   :baddr :nop-recr
					   :efaddr 4))
	  (ui (alu-simp (constant (eval *stq-efaddr-base*)) (limit $exp)))
	  (ui (alu-simp (constant (eval *stq-efaddr-stride*)) (Q $exp))
	      (sop :float-bus :transposer-a
		   :float-bus-direction :write
		   :float-bus-pointer-control :post-clear)))

	;; To use the values as indirect addressing offsets they must be read slicewise.
	;; Thus we have to transpose the contents of TA into TB.
	(ucblock-ns

	  ;; Load TA into the register file.  Rather than having 20 instructions, we loop
	  ;; and use fparg-instr to specify the register to load.  We have initialized limit
	  ;; with the starting efaddr (12) and Q with the efaddr stride (1).
	  (ucblock-ns
	    (ui (seq push t 19))
	    (ucblock-ns
	      (ui (sop :float-bus :transposer-a
		       :float-bus-direction :read
		       :float-bus-pointer-control :post-add)
		  (alu-simp (+ limit Q) (y limit) (limit $exp))
		  (fparg-instr t)
		  (wtl3164-dynamic-instruction :xcnt :load-int-reg
					       :efaddr 0
					       :baddr :nop-recr)
		  (seq rfct))))

	  ;; Write the register file back to TB.  We use the same arg-instr technique to
	  ;; avoid writing out 20 instructions.
	  (ucblock-ns
	    (ui (alu-simp (constant (eval *stq-efaddr-base*)) (limit $exp)))
	    (ui (lit-vert alu) (lit-16-sf 17)
		(dbus-src-sel lit)
		(ybus-src-sel bypass)
		(ybus-dest-sel dispatch))
	    (ui (alu-simp (+ limit Q) (y limit) (limit $exp))
		(fparg-instr t)
		(wtl3164-dynamic-instruction :xcnt :store-int-reg
					     :efaddr 0
					     :baddr :nop-recr))
	    (ui (alu-simp (+ limit Q) (y limit) (limit $exp))
		(fparg-instr t)
		(wtl3164-dynamic-instruction :xcnt :store-int-reg
					     :efaddr 0
					     :baddr :nop-recr)
		(seq push t dispatch))
	    (ucblock-ns
	      (ui (sop :float-bus :transposer-b
		       :float-bus-direction :write
		       :float-bus-pointer-control :post-add)
		  (alu-simp (+ limit Q) (y limit) (limit $exp))
		  (fparg-instr t)
		  (wtl3164-dynamic-instruction :xcnt :store-int-reg
					       :efaddr 0
					       :baddr :nop-recr)
		  (seq rfct)))
	    (ui (sop :float-bus :transposer-b
		     :float-bus-direction :write
		     :float-bus-pointer-control :post-add))
	    (ui (sop :float-bus :transposer-b
		     :float-bus-direction :write
		     :float-bus-pointer-control :post-add))))))

    ;; Load the element bound.
    (load-sprint-rug-from-memory :vp-size (constant (eval *sr-element-limit*)))

    ;; Initialize the element base pointer.
    (setreg current-dest-base (constant (eval *sr-dest-base-table*)))))

(cmi::increment-patch-level 9)
