CoCalc -- codesizeprediction.py

GitHub Repository: Roblox/luau
Path: blob/master/tools/codesizeprediction.py
²⁷²³ views
1
#!/usr/bin/python3
2
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
3

4
# NOTE: This script is experimental. It uses linear regression to construct a model for predicting native
# code size from bytecode. Some initial work has been done to analyze a large corpus of Luau scripts, and while for
# most functions the model predicts the native code size quite well (+/-25%), there are many cases where the predicted
# size is off by as much as 13x. Notably, the predicted size is generally better for smaller functions and worse for
# larger functions. Therefore, in its current form this analysis is probably not suitable for use as a basis for
# compilation heuristics. A nonlinear model may produce better results. The script here exists as a foundation for
# further exploration.
11

12

13
import json
14
import glob
15
from pathlib import Path
16
import pandas as pd
17
import numpy as np
18
from sklearn.linear_model import LinearRegression
19
import matplotlib.pyplot as plt
20
import argparse
21

22

23
def readStats(statsFileGlob):
    '''Reads the stats files matching the supplied glob into a DataFrame.

    Files should be generated by the Compile.cpp CLI. Each file is parsed as
    JSON mapping a script name to its stats; one row is produced for every
    entry of stats[script]["lowerStats"]["functions"].

    Args:
        statsFileGlob: glob pattern for the stats files; "**" is expanded
            recursively.

    Returns:
        A DataFrame with the columns statsFile, script, name, line,
        bcodeCount, irCount, asmCount and bytecodeSummary. bytecodeSummary
        holds the first entry of the function's "bytecodeSummary" list as a
        tuple (hashable, so the column can be grouped and de-duplicated).

    Raises:
        KeyError: if a stats entry lacks one of the keys read here.
    '''
    statsFiles = glob.glob(statsFileGlob, recursive=True)

    print("Reading %s files." % len(statsFiles))

    # Per-function values that are copied into the table unchanged.
    scalarFields = ("name", "line", "bcodeCount", "irCount", "asmCount")

    # Insertion order defines the column order of the resulting DataFrame.
    df_dict = {"statsFile": [], "script": []}
    for field in scalarFields:
        df_dict[field] = []
    df_dict["bytecodeSummary"] = []

    for statsFile in statsFiles:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        stats = json.loads(Path(statsFile).read_text(encoding="utf-8"))
        for script, filestats in stats.items():
            for funstats in filestats["lowerStats"]["functions"]:
                df_dict["statsFile"].append(statsFile)
                df_dict["script"].append(script)
                for field in scalarFields:
                    df_dict[field].append(funstats[field])
                df_dict["bytecodeSummary"].append(
                    tuple(funstats["bytecodeSummary"][0]))

    return pd.DataFrame.from_dict(df_dict)
57

58

59
def addFunctionCount(df):
    '''Adds a functionCount column to the stats table.

    For every bytecodeSummary value, functionCount is the number of distinct
    asmCount values that occur together with it in df.

    Args:
        df: DataFrame with at least the columns asmCount and bytecodeSummary.

    Returns:
        A new DataFrame: df left-merged with the per-summary counts, so all
        rows of df are kept.
    '''
    # Collapse rows that share both the summary and the native size ...
    distinctPairs = df.drop_duplicates(
        subset=['asmCount', 'bytecodeSummary'], ignore_index=True)
    # ... then count how many different sizes remain per summary.
    counts = distinctPairs.groupby(
        ['bytecodeSummary']).size().reset_index(name='functionCount')
    return df.merge(counts, on='bytecodeSummary', how='left')
63

64
# def deduplicateDf(df):
65
#    return df.drop_duplicates(subset=['bcodeCount', 'asmCount', 'bytecodeSummary'], ignore_index=True)
66

67

68
def randomizeDf(df, randomState=None):
    '''Returns a copy of df with its rows in random order.

    Args:
        df: DataFrame to shuffle.
        randomState: optional seed (or random state object) forwarded to
            DataFrame.sample; pass a fixed value for a reproducible shuffle.
            The default None gives a different order on every call.

    Returns:
        A DataFrame containing every row of df exactly once; the original
        index labels are kept.
    '''
    # frac=1 samples all rows without replacement, i.e. a permutation.
    return df.sample(frac=1, random_state=randomState)
70

71

72
def splitSeq(seq):
    '''Splits a sequence into two halves.

    Args:
        seq: any object supporting len() and slicing.

    Returns:
        A tuple (firstHalf, secondHalf). For an odd length the extra element
        goes to the second half.
    '''
    midpoint = len(seq) // 2
    return (seq[:midpoint], seq[midpoint:])
75

76

77
def trainAsmSizePredictor(df):
    '''Fits a linear model predicting asmCount from bytecodeSummary.

    The first half of the rows (in their current order) is used for training
    and the second half for validation. The model is a LinearRegression with
    non-negative coefficients and no intercept. Score, RMSE values and the
    model parameters are printed.

    Args:
        df: DataFrame with the columns bytecodeSummary (one numeric sequence
            per row) and asmCount. It is modified in place: the columns
            asmCountPredicted, usedForTraining, diff and diffPerc are added.

    Returns:
        A tuple (reg, df) of the fitted model and the same, augmented
        DataFrame.
    '''
    XTrain, XValidate = splitSeq(
        np.array([list(summary) for summary in df.bytecodeSummary]))
    YTrain, YValidate = splitSeq(np.array(df.asmCount))

    reg = LinearRegression(
        positive=True, fit_intercept=False).fit(XTrain, YTrain)
    trainPredicted = reg.predict(XTrain)
    validatePredicted = reg.predict(XValidate)

    trainRmse = np.sqrt(np.mean((trainPredicted - YTrain) ** 2))
    predictRmse = np.sqrt(np.mean((validatePredicted - YValidate) ** 2))

    print(f"Score: {reg.score(XTrain, YTrain)}")
    print(f"Training RMSE: {trainRmse}")
    print(f"Prediction RMSE: {predictRmse}")
    print(f"Model Intercept: {reg.intercept_}")
    print(f"Model Coefficients:\n{reg.coef_}")

    # Predictions are concatenated in the same order the rows were split, so
    # they line up positionally with the rows of df.
    df['asmCountPredicted'] = np.concatenate(
        (trainPredicted, validatePredicted)).round().astype(int)
    df['usedForTraining'] = np.concatenate(
        (np.ones(trainPredicted.size, dtype=bool),
         np.zeros(validatePredicted.size, dtype=bool)))
    df['diff'] = df['asmCountPredicted'] - df['asmCount']
    df['diffPerc'] = (100 * df['diff']) / df['asmCount']
    # Rows with asmCount == 0 have no meaningful relative error: the division
    # yields +/-inf, or NaN when the prediction is 0 as well. Report 0 for all
    # of them so they cannot distort the range of the error histogram.
    df.loc[~np.isfinite(df['diffPerc']), 'diffPerc'] = 0.0
    df['diffPerc'] = df['diffPerc'].round()

    return (reg, df)
107

108

109
def saveModel(reg, file):
    '''Writes the model parameters to a text file.

    Args:
        reg: fitted model exposing the attributes intercept_ and coef_.
        file: path of the text file to create or overwrite.
    '''
    # The context manager closes the file even if formatting or writing fails.
    with open(file, "w", encoding="utf-8") as f:
        f.write(f"Intercept: {reg.intercept_}\n")
        f.write(f"Coefficients: \n{reg.coef_}\n")
114

115

116
def bcodeVsAsmPlot(df, plotFile=None, minBcodeCount=None, maxBcodeCount=None):
    '''Draws a scatter plot of asmCount against bcodeCount.

    The plot is drawn on the current pyplot figure.

    Args:
        df: DataFrame with the columns bcodeCount and asmCount.
        plotFile: if given, path the figure is saved to.
        minBcodeCount: smallest bcodeCount to include (inclusive); defaults
            to the minimum found in df.
        maxBcodeCount: largest bcodeCount to include (inclusive); defaults
            to the maximum found in df.

    Returns:
        The matplotlib.pyplot module, so the caller can e.g. call show().
    '''
    if minBcodeCount is None:
        minBcodeCount = df.bcodeCount.min()
    if maxBcodeCount is None:
        maxBcodeCount = df.bcodeCount.max()

    inRange = (df.bcodeCount >= minBcodeCount) & (
        df.bcodeCount <= maxBcodeCount)
    subDf = df[inRange]

    plt.scatter(subDf.bcodeCount, subDf.asmCount)
    plt.title("ASM variation by Bytecode")
    plt.xlabel("Bytecode Instruction Count")
    plt.ylabel("ASM Instruction Count")

    if plotFile is not None:
        plt.savefig(plotFile)

    return plt
134

135

136
def predictionErrorPlot(df, plotFile=None, minPerc=None, maxPerc=None, bins=200):
    '''Draws a histogram of the prediction error of the validation rows.

    Only rows whose usedForTraining value is False are plotted. The plot is
    drawn on the current pyplot figure.

    Args:
        df: DataFrame with the columns diffPerc and usedForTraining.
        plotFile: if given, path the figure is saved to.
        minPerc: smallest diffPerc to include (inclusive); defaults to the
            minimum found in df.
        maxPerc: largest diffPerc to include (inclusive); defaults to the
            maximum found in df.
        bins: number of histogram bins.

    Returns:
        The matplotlib.pyplot module, so the caller can e.g. call show().
    '''
    if minPerc is None:
        minPerc = df['diffPerc'].min()
    if maxPerc is None:
        maxPerc = df['diffPerc'].max()

    isValidationRow = df["usedForTraining"].eq(False)
    inRange = (df["diffPerc"] >= minPerc) & (df["diffPerc"] <= maxPerc)
    plotDf = df[isValidationRow & inRange]

    plt.hist(plotDf["diffPerc"], bins=bins)
    plt.title("Prediction Error Distribution")
    plt.xlabel("Prediction Error %")
    plt.ylabel("Function Count")

    if plotFile is not None:
        plt.savefig(plotFile)

    return plt
154

155

156
def parseArgs(argv=None):
    '''Parses the command line of the script.

    Args:
        argv: optional list of argument strings to parse instead of the
            process command line; the default None reads sys.argv[1:].

    Returns:
        The argparse namespace with the attributes fileglob, modelfile,
        nativesizefig and predictionerrorfig (the last two are None when the
        option is not given).
    '''
    parser = argparse.ArgumentParser(
        prog='codesizeprediction.py',
        description='Constructs a linear regression model to predict native instruction count from bytecode opcode distribution')
    parser.add_argument("fileglob",
                        help="glob pattern for stats files to be used for training")
    parser.add_argument("modelfile",
                        help="text file to save model details")
    parser.add_argument("--nativesizefig",
                        help="path for saving the plot showing the variation of native code size with bytecode")
    parser.add_argument("--predictionerrorfig",
                        help="path for saving the plot showing the distribution of prediction error")
    return parser.parse_args(argv)
169

170

171
if __name__ == "__main__":
    # Pipeline: load stats -> annotate -> shuffle -> plot raw data ->
    # train/validate the model -> save it -> plot the validation error.
    args = parseArgs()

    statsDf = addFunctionCount(readStats(args.fileglob))
    # Shuffle so the train/validation halves are not biased by file order.
    shuffledDf = randomizeDf(statsDf)

    bcodeVsAsmPlot(shuffledDf, args.nativesizefig, 0, 100)
    plt.show()
    # With a non-interactive backend show() leaves the figure open; close it
    # so the histogram below is not drawn on top of the scatter plot.
    plt.close("all")

    (reg, predictedDf) = trainAsmSizePredictor(shuffledDf)
    saveModel(reg, args.modelfile)

    predictionErrorPlot(predictedDf, args.predictionerrorfig, -200, 200)
    plt.show()
186

187
Product

Resources

Company