/**
 * Copyright (C) 2017-2018 Xilinx, Inc
 * Author: Sonal Santan
 * AWS HAL Driver layered on top of kernel drivers
 *
 * Code copied from SDAccel XDMA based HAL driver
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may
 * not use this file except in compliance with the License. A copy of the
 * License is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
#ifndef _XDMA_SHIM_H_
#define _XDMA_SHIM_H_

#include "xclhal.h"
#include "xclperf.h"
#include "drm.h"
#include <fstream>
#include <list>
#include <map>
#include <vector>
#include <string>
#include <mutex>
#include <cassert>

#ifndef INTERNAL_TESTING
#include "fpga_pci.h"
#include "fpga_mgmt.h"
#endif

// Work around GCC 4.8 + XDMA BAR implementation bugs.
// With -O3, PCIe BAR reads/writes are not reliable, hence force -O2 as the
// maximum optimization level for pcieBarRead() and pcieBarWrite().
// SHIM_O2 expands to the optimize("-O2") function attribute when both
// __GNUC__ and NDEBUG are defined, and to nothing otherwise.
#if defined(__GNUC__) && defined(NDEBUG)
#define SHIM_O2 __attribute__ ((optimize("-O2")))
#else
#define SHIM_O2
#endif

namespace awsbwhal {


// Forward declaration so the stream operator below can name the type before
// the struct itself is defined.
struct AddresRange;

// Stream insertion operator for AddresRange. Only the declaration lives in
// this header.
std::ostream& operator<< (std::ostream &strm, const AddresRange &rng);

/**
 * Simple tuple struct to store non-overlapping address ranges: `first` is the
 * start address and `second` is the size.
 *
 * A range whose size is zero is a lookup key: it carries only an address that
 * was passed by the user and compares equivalent to the stored range that
 * contains that address. The type is move-only (copying is deleted).
 */
struct AddresRange : public std::pair<uint64_t, size_t> {
  // size will be zero when we are looking up an address that was passed by the user
  AddresRange(uint64_t addr, size_t size = 0) : std::pair<uint64_t, size_t>(addr, size) {
  }
  AddresRange(AddresRange && rhs) noexcept : std::pair<uint64_t, size_t>(std::move(rhs)) {
  }

  AddresRange(const AddresRange &rhs) = delete;
  AddresRange& operator=(const AddresRange &rhs) = delete;

  /**
   * Ordering used when AddresRange is a key in std::map.
   *
   * - range  vs range  : ordered by start address
   * - range  vs address: range is less only if it ends at or before the address
   *                      (1000, 100) < (1100, 0) because the range ends at 1099
   * - address vs range : address is less if it lies before the range start
   *                      (1100, 0) < (1200, 100)
   * - address vs address: ordered by address, so x < x is always false and the
   *                      relation stays a strict weak ordering even if a
   *                      zero-size range ends up stored in a map
   */
  bool operator < (const AddresRange& other) const {
    if ((this->second != 0) && (other.second == 0)) {
      // Same test as (first + second) <= other.first, written as a
      // subtraction so that a range reaching the top of the 64-bit address
      // space cannot wrap around and compare less than a small address.
      return (other.first >= this->first) &&
             ((other.first - this->first) >= this->second);
    }
    // Every other combination is decided by the start address alone.
    return (this->first < other.first);
  }
};

/**
 * Simple map of address range to its bo handle and mapped virtual address
 */
static const std::pair<unsigned, char *> mNullValue = std::make_pair(0xffffffff, nullptr);
class RangeTable {
  std::map<AddresRange, std::pair<unsigned, char *>> mTable;
  mutable std::mutex mMutex;
public:
  void insert(uint64_t addr, size_t size, std::pair<unsigned, char *> bo) {
    // assert(find(addr) == 0xffffffff);
    std::lock_guard<std::mutex> lock(mMutex);
    mTable[AddresRange(addr, size)] = bo;
  }

  std::pair<unsigned, char *> erase(uint64_t addr) {
    std::lock_guard<std::mutex> lock(mMutex);
    std::map<AddresRange, std::pair<unsigned, char *>>::const_iterator i = mTable.find(AddresRange(addr));
    if (i == mTable.end())
      return mNullValue;
    std::pair<unsigned, char *> result = i->second;
    mTable.erase(i);
    return result;
  }

  std::pair<unsigned, char *> find(uint64_t addr) const {
    std::lock_guard<std::mutex> lock(mMutex);
    std::map<AddresRange, std::pair<unsigned, char *>>::const_iterator i = mTable.find(AddresRange(addr));
    if (i == mTable.end())
      return mNullValue;
    return i->second;
  }
};


    // Memory alignment for DDR and AXI-MM trace access
    //
    // Owns a buffer large enough for `count` elements of T, obtained from
    // posix_memalign() with the requested alignment and released with free()
    // on destruction. When the allocation fails getBuffer() returns a null
    // pointer; size() still reports count * sizeof(T).
    //
    // The buffer has exactly one owner: copying is disabled, because a copy
    // would make two objects free the same pointer, and moving hands the
    // buffer over and leaves the source empty.
    template <typename T> class AlignedAllocator {
        void *mBuffer;
        size_t mCount;
    public:
        // Start of the buffer, or null when allocation failed or the buffer
        // was moved out of this object.
        T *getBuffer() {
            return static_cast<T *>(mBuffer);
        }

        // Requested buffer size in bytes.
        size_t size() const {
            return mCount * sizeof(T);
        }

        AlignedAllocator(size_t alignment, size_t count) : mBuffer(nullptr), mCount(count) {
            // posix_memalign() returns non-zero on failure; make sure the
            // pointer is null in that case so the destructor has nothing to free.
            if (posix_memalign(&mBuffer, alignment, count * sizeof(T))) {
                mBuffer = nullptr;
            }
        }

        AlignedAllocator(const AlignedAllocator &rhs) = delete;
        AlignedAllocator& operator=(const AlignedAllocator &rhs) = delete;

        AlignedAllocator(AlignedAllocator &&rhs) noexcept : mBuffer(rhs.mBuffer), mCount(rhs.mCount) {
            rhs.mBuffer = nullptr;
            rhs.mCount = 0;
        }

        AlignedAllocator& operator=(AlignedAllocator &&rhs) noexcept {
            if (this != &rhs) {
                if (mBuffer)
                    free(mBuffer);
                mBuffer = rhs.mBuffer;
                mCount = rhs.mCount;
                rhs.mBuffer = nullptr;
                rhs.mCount = 0;
            }
            return *this;
        }

        ~AlignedAllocator() {
            if (mBuffer)
                free(mBuffer);
        }
    };

    // "Null" address constant: every bit of the 64-bit value is set.
    const uint64_t mNullAddr = ~static_cast<uint64_t>(0);
    // "Null" BO constant: only the low 32 bits are set.
    const uint64_t mNullBO = 0xffffffffull;

    // XDMA Shim
    //
    // HAL driver class for the AWS platform. It is constructed from a device
    // index, a log file name and a verbosity level, and declares the buffer
    // object (BO) API, bitstream loading, raw read/write, legacy device
    // buffer management, performance monitoring / trace and flashing helpers.
    // Apart from a few small inline predicates, only declarations live here.
    class AwsXcl{

        // One record of an MCS stream used by the flashing helpers: an address
        // range, a data count and the stream position of the data.
        struct ELARecord {
            unsigned mStartAddress;
            unsigned mEndAddress;
            unsigned mDataCount;

            std::streampos mDataPos;
            ELARecord() : mStartAddress(0), mEndAddress(0),
                          mDataCount(0), mDataPos(0) {}
        };

        typedef std::list<ELARecord> ELARecordList;

        typedef std::list<std::pair<uint64_t, uint64_t> > PairList;

    public:
        //Sarab: Added for HAL2 XOCL Driver support
        //int xclGetErrorStatus(xclErrorStatus *info); Not supported for AWS
        bool xclUnlockDevice();
        unsigned int xclAllocBO(size_t size, xclBOKind domain, unsigned flags);
        unsigned int xclAllocUserPtrBO(void *userptr, size_t size, unsigned flags);
        void xclFreeBO(unsigned int boHandle);
        int xclWriteBO(unsigned int boHandle,
            const void *src, size_t size, size_t seek);
        int xclReadBO(unsigned int boHandle,
            void *dst, size_t size, size_t skip);
        void *xclMapBO(unsigned int boHandle, bool write);
        int xclSyncBO(unsigned int boHandle, xclBOSyncDirection dir,
            size_t size, size_t offset);
        int xclExportBO(unsigned int boHandle);
        unsigned int xclImportBO(int fd, unsigned flags);
        int xclGetBOProperties(unsigned int boHandle, xclBOProperties *properties);
        ssize_t xclUnmgdPread(unsigned flags, void *buf,
            size_t count, uint64_t offset);
        ssize_t xclUnmgdPwrite(unsigned flags, const void *buf,
            size_t count, uint64_t offset);


        // Bitstreams
        int xclGetXclBinIdFromSysfs(uint64_t &xclbinid);
        int xclLoadXclBin(const xclBin *buffer);
        int xclLoadAxlf(const axlf *buffer);
        int xclUpgradeFirmware(const char *fileName);
        int xclUpgradeFirmware2(const char *file1, const char* file2);
        //int xclUpgradeFirmwareXSpi(const char *fileName, int device_index=0); Not supported by AWS
        int xclTestXSpi(int device_index);
        int xclBootFPGA();
        int xclRemoveAndScanFPGA();
        int resetDevice(xclResetKind kind);
        int xclReClock2(unsigned short region, const unsigned short *targetFreqMHz);

        // Raw read/write
        size_t xclWrite(xclAddressSpace space, uint64_t offset, const void *hostBuf, size_t size);
        size_t xclRead(xclAddressSpace space, uint64_t offset, void *hostBuf, size_t size);

        // Buffer management
        uint64_t xclAllocDeviceBuffer(size_t size);
        uint64_t xclAllocDeviceBuffer2(size_t size, xclMemoryDomains domain, unsigned flags);
        void xclFreeDeviceBuffer(uint64_t buf);
        size_t xclCopyBufferHost2Device(uint64_t dest, const void *src, size_t size, size_t seek);
        size_t xclCopyBufferDevice2Host(void *dest, uint64_t src, size_t size, size_t skip);

        // Performance monitoring
        // Control
        double xclGetDeviceClockFreqMHz();
        double xclGetReadMaxBandwidthMBps();
        double xclGetWriteMaxBandwidthMBps();
        //void xclSetOclRegionProfilingNumberSlots(uint32_t numSlots);
        void xclSetProfilingNumberSlots(xclPerfMonType type, uint32_t numSlots);
        size_t xclPerfMonClockTraining(xclPerfMonType type);
        // Counters
        size_t xclPerfMonStartCounters(xclPerfMonType type);
        size_t xclPerfMonStopCounters(xclPerfMonType type);
        size_t xclPerfMonReadCounters(xclPerfMonType type, xclCounterResults& counterResults);
        //debug related
        uint32_t getCheckerNumberSlots(int type);
        uint32_t getIPCountAddrNames(int type, uint64_t *baseAddress, std::string * portNames);
        size_t xclDebugReadCounters(xclDebugCountersResults* debugResult);
        size_t xclDebugReadCheckers(xclDebugCheckersResults* checkerResult);
        void readDebugIpLayout();

        // Trace
        size_t xclPerfMonStartTrace(xclPerfMonType type, uint32_t startTrigger);
        size_t xclPerfMonStopTrace(xclPerfMonType type);
        uint32_t xclPerfMonGetTraceCount(xclPerfMonType type);
        size_t xclPerfMonReadTrace(xclPerfMonType type, xclTraceResultsVector& traceVector);

        // Sanity checks
        int xclGetDeviceInfo2(xclDeviceInfo2 *info);
        static AwsXcl *handleCheck(void *handle);
        static unsigned xclProbe();
        bool xclLockDevice();
        // Returns the signature value stored in mTag.
        unsigned getTAG() const {
            return mTag;
        }
        bool isGood() const;

        ~AwsXcl();
        AwsXcl(unsigned index, const char *logfileName, xclVerbosityLevel verbosity);

    private:

        size_t xclReadModifyWrite(uint64_t offset, const void *hostBuf, size_t size);
        size_t xclReadSkipCopy(uint64_t offset, void *hostBuf, size_t size);
        bool zeroOutDDR();

        // True when mSubsystemId shifted right by 12 bits equals 4.
        bool isXPR() const {
          return ((mDeviceInfo.mSubsystemId >> 12) == 4);
        }

        // Builds a DSA number from the device id (shifted left by 16 bits)
        // OR'ed with the subsystem id and reports whether it is one of the
        // two listed values.
        bool isMultipleOCLClockSupported() {
          unsigned dsaNum = ((mDeviceInfo.mDeviceId << 16) | mDeviceInfo.mSubsystemId);
          // 0x82384431 : TUL KU115 4ddr 3.1 DSA
          return ((dsaNum == 0x82384431)  || (dsaNum == 0x82384432))? true : false;
        }

        // True when the 0x8000 bit of mDeviceId is set.
        bool isUltraScale() const {
            return (mDeviceInfo.mDeviceId & 0x8000);
        }

        // Core DMA code
        // pcieBarRead()/pcieBarWrite() carry SHIM_O2 to cap their optimization
        // level (see the SHIM_O2 definition near the top of this header).
        SHIM_O2 int pcieBarRead(int bar_num, unsigned long long offset, void* buffer, unsigned long long length);
        SHIM_O2 int pcieBarWrite(int bar_num, unsigned long long offset, const void* buffer, unsigned long long length);
        int freezeAXIGate();
        int freeAXIGate();

        // PROM flashing
        int prepare(unsigned startAddress, unsigned endAddress);
        int program(std::ifstream& mcsStream, const ELARecord& record);
        int program(std::ifstream& mcsStream);
        int waitForReady(unsigned code, bool verbose = true);
        int waitAndFinish(unsigned code, unsigned data, bool verbose = true);

        //XSpi flashing.
        bool prepareXSpi();
        int programXSpi(std::ifstream& mcsStream, const ELARecord& record);
        int programXSpi(std::ifstream& mcsStream);
        bool waitTxEmpty();
        bool isFlashReady();
        //bool windDownWrites();
        bool bulkErase();
        bool sectorErase(unsigned Addr);
        bool writeEnable();
#if 0
	bool dataTransfer(bool read);
#endif
        bool readPage(unsigned addr, uint8_t readCmd = 0xff);
        bool writePage(unsigned addr, uint8_t writeCmd = 0xff);
        unsigned readReg(unsigned offset);
        int writeReg(unsigned regOffset, unsigned value);
        bool finalTransfer(uint8_t *sendBufPtr, uint8_t *recvBufPtr, int byteCount);
        bool getFlashId();
        //All remaining read /write register commands can be issued through this function.
        bool readRegister(unsigned commandCode, unsigned bytes);
        bool writeRegister(unsigned commandCode, unsigned value, unsigned bytes);
        bool select4ByteAddressMode();
        bool deSelect4ByteAddressMode();


        // Performance monitoring helper functions
        bool isDSAVersion(unsigned majorVersion, unsigned minorVersion, bool onlyThisVersion);
        unsigned getBankCount();
        uint64_t getHostTraceTimeNsec();
        uint64_t getPerfMonBaseAddress(xclPerfMonType type, uint32_t slotNum);
        uint64_t getPerfMonFifoBaseAddress(xclPerfMonType type, uint32_t fifonum);
        uint64_t getPerfMonFifoReadBaseAddress(xclPerfMonType type, uint32_t fifonum);
        uint32_t getPerfMonNumberSlots(xclPerfMonType type);
        uint32_t getPerfMonNumberSamples(xclPerfMonType type);
        uint32_t getPerfMonNumberFifos(xclPerfMonType type);
        uint32_t getPerfMonByteScaleFactor(xclPerfMonType type);
        uint8_t  getPerfMonShowIDS(xclPerfMonType type);
        uint8_t  getPerfMonShowLEN(xclPerfMonType type);
        uint32_t getPerfMonSlotStartBit(xclPerfMonType type, uint32_t slotnum);
        uint32_t getPerfMonSlotDataWidth(xclPerfMonType type, uint32_t slotnum);
        size_t resetFifos(xclPerfMonType type);
        uint32_t bin2dec(std::string str, int start, int number);
        uint32_t bin2dec(const char * str, int start, int number);
        std::string dec2bin(uint32_t n);
        std::string dec2bin(uint32_t n, unsigned bits);
        static std::string getDSAName(unsigned short deviceId, unsigned short subsystemId);

    private:
        // This is a hidden signature of this class and helps in preventing
        // user errors when incorrect pointers are passed in as handles.
        const unsigned mTag;
        const int mBoardNumber;
        const size_t maxDMASize;
        bool mLocked;
        // One offset per address space, indexed up to XCL_ADDR_SPACE_MAX.
        const uint64_t mOffsets[XCL_ADDR_SPACE_MAX];
        int mUserHandle;
        // INTERNAL_TESTING builds keep a plain management handle; otherwise
        // the three PCI BAR handles below are used.
#ifdef INTERNAL_TESTING
        int mMgtHandle;
#else
        pci_bar_handle_t ocl_kernel_bar;     // AppPF BAR0 for OpenCL kernels
        pci_bar_handle_t sda_mgmt_bar;       // MgmtPF BAR4, for SDAccel Perf mon etc
        pci_bar_handle_t ocl_global_mem_bar; // AppPF BAR4
#endif
        uint32_t mMemoryProfilingNumberSlots;
        uint32_t mOclRegionProfilingNumberSlots;
        std::string mDevUserName;

        // Information extracted from platform linker
        bool mIsDebugIpLayoutRead = false;
        bool mIsDeviceProfiling = false;
        uint64_t mPerfMonFifoCtrlBaseAddress;
        uint64_t mPerfMonFifoReadBaseAddress;
        uint64_t mPerfMonBaseAddress[XSPM_MAX_NUMBER_SLOTS];
        std::string mPerfMonSlotName[XSPM_MAX_NUMBER_SLOTS];

        char *mUserMap;
        std::ofstream mLogStream;
        xclVerbosityLevel mVerbosity;
        std::string mBinfile;
        ELARecordList mRecordList;
        xclDeviceInfo2 mDeviceInfo;
        // Address-range table for the legacy device-buffer API
        // (NOTE(review): inferred from the member name; confirm against the
        // implementation file).
        RangeTable mLegacyAddressTable;

        // AFI helpers, only declared when INTERNAL_TESTING is not defined.
#ifndef INTERNAL_TESTING
        int sleepUntilLoaded( std::string afi );
        int checkAndSkipReload( char *afi_id, fpga_mgmt_image_info *info );
        int loadDefaultAfiIfCleared( void );
#endif
    public:
        // Class-wide tag constant; its value is defined outside this header.
        static const unsigned TAG;
    };
}

#endif