C minifier with Clang

I recently revamped Competitive programming in Nim. In short, I can create a C amalgamation from a Nim program and submit the C source code to various competitive programming websites.

Then I use a Clang based tool to shorten the C source code. It does two things:

  • Shorten function, variable, and type names
  • Use the clangFormat library to remove some whitespace

For the first step, the tool uses a derived ASTFrontendAction to traverse the AST twice: once to collect function/variable/type names and once to rename them. Building a clang::CompilerInstance from a command line needs some boilerplate. An alternative is to use clang::tooling::CommonOptionsParser and clang::tooling::ClangTool.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/*
* Collector (driven by MiniASTConsumer) collects identifiers in `used` and rename candidates (in the main file) in `d2name`.
* MiniASTConsumer iterates over `d2name` and assigns new names.
* Renamer creates clang::tooling::Replacement instances.
* HandleTranslationUnit calls clang::tooling::applyAllReplacements.
*/

#include <clang/AST/ASTConsumer.h>
#include <clang/AST/Decl.h>
#include <clang/AST/RecursiveASTVisitor.h>
#include <clang/Basic/FileManager.h>
#include <clang/Basic/LangOptions.h>
#include <clang/Basic/SourceManager.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/Driver/Action.h>
#include <clang/Driver/Compilation.h>
#include <clang/Driver/Driver.h>
#include <clang/Driver/Tool.h>
#include <clang/Format/Format.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/FrontendAction.h>
#include <clang/Lex/Lexer.h>
#include <clang/Lex/PreprocessorOptions.h>
#include <clang/Tooling/Core/Replacement.h>
#include <llvm/ADT/CachedHashString.h>
#include <llvm/ADT/DenseSet.h>
#include <llvm/ADT/MapVector.h>
#include <llvm/ADT/STLExtras.h>
#include <llvm/Support/Host.h>
#include <llvm/Support/Path.h>
#include <llvm/Support/raw_ostream.h>

#include <memory>
#include <vector>

#include <assert.h>
#include <err.h>
#include <unistd.h>

using namespace clang;
using namespace llvm;

namespace {
// Builds a CompilerInvocation for `args` by letting the Clang driver compute
// the cc1 command line it would run and parsing that command line.
//
// `args[0]` is used as the driver's executable path. Returns nullptr when the
// driver cannot build a compilation, when the compilation is not exactly one
// job created by the "clang" tool, or when the cc1 arguments fail to parse.
std::unique_ptr<CompilerInvocation> buildCompilerInvocation(ArrayRef<const char *> args) {
  // Every diagnostic produced while driving/parsing is discarded.
  IntrusiveRefCntPtr<DiagnosticsEngine> diagEngine(
      CompilerInstance::createDiagnostics(new DiagnosticOptions, new IgnoringDiagConsumer, true));

  driver::Driver drv(args[0], llvm::sys::getDefaultTargetTriple(), *diagEngine, "cminify",
                     llvm::vfs::getRealFileSystem());
  drv.setCheckInputsExist(false);

  std::unique_ptr<driver::Compilation> compilation(drv.BuildCompilation(args));
  if (!compilation)
    return nullptr;

  // Only a single compile job is supported.
  const driver::JobList &jobList = compilation->getJobs();
  if (jobList.size() != 1 || !isa<driver::Command>(*jobList.begin()))
    return nullptr;

  const driver::Command &job = cast<driver::Command>(*jobList.begin());
  if (StringRef(job.getCreator().getName()) != "clang")
    return nullptr;

  const llvm::opt::ArgStringList &cc1Args = job.getArguments();
  auto invocation = std::make_unique<CompilerInvocation>();
  if (!CompilerInvocation::CreateFromArgs(*invocation, cc1Args, *diagEngine))
    return nullptr;

  invocation->getDiagnosticOpts().IgnoreWarnings = true;
  invocation->getFrontendOpts().DisableFree = false;
  return invocation;
}

// Function names that must keep their original spelling. main() fills this
// from the `-f` options and always appends "main".
SmallVector<StringRef, 0> ignores;
// Rename candidates keyed by canonical declaration. Collector stores a kind
// marker ("_f", "_v" or "_t"); MiniASTConsumer later overwrites the marker
// with the name the declaration will have in the output.
MapVector<Decl *, std::string> d2name;
// Identifier spellings that generated names must not reuse (checked by
// MiniASTConsumer::getName).
DenseSet<CachedHashStringRef> used;
// The transformed source text: written by HandleTranslationUnit, rewritten by
// reformat(), and finally emitted by main().
std::string newCode;

// First AST pass. Records identifier spellings in `used` so generated names
// can avoid them, and tags rename candidates (definitions written in the main
// file) in `d2name` with a kind marker: "_f" for functions, "_v" for
// variables/parameters, "_t" for tags and typedefs.
struct Collector : RecursiveASTVisitor<Collector> {
  SourceManager &sm;

  Collector(ASTContext &ctx) : sm(ctx.getSourceManager()) {}

  // Tags defined main-file functions (unless listed in `ignores`) and their
  // parameters. Parameters are tagged even when the function itself is
  // ignored.
  bool VisitFunctionDecl(FunctionDecl *fd) {
    if (fd->isOverloadedOperator() || !fd->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(fd->getName()));
    if (!fd->isDefined() || !sm.isWrittenInMainFile(fd->getLocation()))
      return true;
    if (!is_contained(ignores, fd->getName()))
      d2name[fd->getCanonicalDecl()] = "_f";
    for (ParmVarDecl *param : fd->parameters())
      VisitVarDecl(param);
    return true;
  }

  // Tags named variables whose declaration is a (non-tentative) definition in
  // the main file.
  bool VisitVarDecl(VarDecl *vd) {
    if (!vd->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(vd->getName()));
    if (vd->isThisDeclarationADefinition() != VarDecl::Definition ||
        !sm.isWrittenInMainFile(vd->getLocation()))
      return true;
    d2name[vd->getCanonicalDecl()] = "_v";
    return true;
  }

  // Tags named struct/union/enum definitions in the main file.
  bool VisitTagDecl(TagDecl *td) {
    // An anonymous tag has no name token: tagging it would consume a generated
    // name and hand the renamer a location that is not an identifier. This
    // mirrors the getIdentifier() guards of the function/variable visitors.
    if (!td->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(td->getName()));
    if (!td->isThisDeclarationADefinition() || !sm.isWrittenInMainFile(td->getLocation()))
      return true;
    d2name[td->getCanonicalDecl()] = "_t";
    return true;
  }

  // Tags typedefs written in the main file.
  bool VisitTypedefNameDecl(TypedefNameDecl *d) {
    // Typedef names are ordinary identifiers, so a generated name must not
    // reuse one (for example a typedef coming from an included header).
    if (d->getIdentifier())
      used.insert(CachedHashStringRef(d->getName()));
    if (d->isTransparentTag() || !sm.isWrittenInMainFile(d->getLocation()))
      return true;
    d2name[d->getCanonicalDecl()] = "_t";
    return true;
  }

  // Enumerators are never renamed, but they are ordinary identifiers too, so
  // their spellings are reserved.
  bool VisitEnumConstantDecl(EnumConstantDecl *ecd) {
    if (ecd->getIdentifier())
      used.insert(CachedHashStringRef(ecd->getName()));
    return true;
  }
};

// Second AST pass. For every declaration or reference whose canonical
// declaration has an entry in `d2name`, records a Replacement that rewrites
// the name token to the mapped name.
struct Renamer : RecursiveASTVisitor<Renamer> {
  SourceManager &sm;
  tooling::Replacements &reps;

  Renamer(ASTContext &ctx, tooling::Replacements &reps) : sm(ctx.getSourceManager()), reps(reps) {}

  // Adds one replacement; a rejected replacement aborts via cantFail.
  void replace(CharSourceRange csr, StringRef newText) { cantFail(reps.add(tooling::Replacement(sm, csr, newText))); }

  // Rewrites the token at `loc` when `canon` is a rename candidate.
  // `canon` must already be the canonical declaration, because that is how
  // `d2name` is keyed.
  void renameTokenAt(Decl *canon, SourceLocation loc) {
    if (auto it = d2name.find(canon); it != d2name.end())
      replace(CharSourceRange::getTokenRange(loc), it->second);
  }

  bool VisitFunctionDecl(FunctionDecl *fd) {
    renameTokenAt(fd->getCanonicalDecl(), fd->getLocation());
    return true;
  }

  bool VisitVarDecl(VarDecl *vd) {
    renameTokenAt(vd->getCanonicalDecl(), vd->getLocation());
    return true;
  }

  // References to functions and variables; other referenced declarations
  // (e.g. enumerators) are left alone.
  bool VisitDeclRefExpr(DeclRefExpr *dre) {
    Decl *d = dre->getDecl();
    if (!isa<FunctionDecl>(d) && !isa<VarDecl>(d))
      return true;
    if (auto it = d2name.find(d->getCanonicalDecl()); it != d2name.end())
      replace(CharSourceRange::getTokenRange(SourceRange(dre->getBeginLoc(), dre->getEndLoc())), it->second);
    return true;
  }

  bool VisitTagDecl(TagDecl *d) {
    // A tag that is only reachable through its typedef has no name of its own.
    if (d->getTypedefNameForAnonDecl())
      return true;
    renameTokenAt(d->getCanonicalDecl(), d->getLocation());
    return true;
  }

  bool VisitTagTypeLoc(TagTypeLoc tl) {
    TagDecl *td = tl.getDecl()->getCanonicalDecl();
    if (td->getTypedefNameForAnonDecl())
      return true;
    renameTokenAt(td, tl.getNameLoc());
    return true;
  }

  bool VisitTypedefNameDecl(TypedefNameDecl *d) {
    renameTokenAt(d->getCanonicalDecl(), d->getLocation());
    return true;
  }

  bool VisitTypedefTypeLoc(TypedefTypeLoc tl) {
    // Canonicalize before the lookup, like every other visitor here: a use
    // may refer to a redeclaration of the typedef, which would otherwise miss
    // the entry and leave the use with its old name.
    renameTokenAt(tl.getTypedefNameDecl()->getCanonicalDecl(), tl.getNameLoc());
    return true;
  }
};

struct MiniASTConsumer : ASTConsumer {
ASTContext *ctx;
int n_fn = 0, n_var = 0, n_type = 0;

void Initialize(ASTContext &ctx) override { this->ctx = &ctx; }
static std::string getName(StringRef prefix, int &id) {
static const char digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
std::string newName;
for (;;) {
newName = std::string(1, prefix[id % prefix.size()]);
if (int i = id / prefix.size())
while (newName += digits[i % 62], i /= 62);
id++;
if (!used.contains(CachedHashStringRef(newName))) break;
}
return newName;
}
bool HandleTopLevelDecl(DeclGroupRef dgr) override {
for (auto s : {"j0", "j1", "jn", "j0f", "j1f", "jnf", "j0l", "j1l", "jnl"})
used.insert(CachedHashStringRef(s));
for (auto s : {"y0", "y1", "yn", "y0f", "y1f", "ynf", "y0l", "y1l", "ynl"})
used.insert(CachedHashStringRef(s));

Collector c(*ctx);
for (Decl *d : dgr)
c.TraverseDecl(d);
for (auto &[d, name] : d2name) {
if (name == "_f")
name = getName("abcdefghijklm", n_fn);
else if (name == "_v") {
int old_n_var = n_var;
auto newName = getName("nopqrstuvwxyz", n_var);
if (newName.size() < static_cast<VarDecl *>(d)->getName().size())
name = newName;
else {
name = static_cast<VarDecl *>(d)->getName();
n_var = old_n_var;
}
} else if (name == "_t")
name = getName("ABCDEFGHIJKLMNOPQRSTUVWXYZ", n_type);
}
return true;
}
void HandleTranslationUnit(ASTContext &ctx) override {
tooling::Replacements reps;
Renamer c(ctx, reps);
c.TraverseDecl(ctx.getTranslationUnitDecl());

auto &sm = ctx.getSourceManager();
StringRef code = sm.getBufferData(sm.getMainFileID());
auto res = tooling::applyAllReplacements(code, reps);
if (!res)
errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
newCode = *res;
}
};

struct MiniAction : ASTFrontendAction {
std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &ci,
StringRef inFile) override {
return std::make_unique<MiniASTConsumer>();
}
};

// Runs clangFormat over `newCode` with a style tuned to drop whitespace and
// stores the result back in `newCode`. Exits with status 2 if the formatting
// replacements cannot be applied.
void reformat() {
  // Start from the LLVM preset, then remove the whitespace it would add.
  format::FormatStyle style = cantFail(format::getStyle("LLVM", "-", "LLVM", newCode, nullptr));
  style.ColumnLimit = 9999; // effectively never wrap lines
  style.IndentWidth = 0;
  style.ContinuationIndentWidth = 0;
  style.SpaceBeforeAssignmentOperators = false;
  style.SpaceBeforeParens = format::FormatStyle::SBPO_Never;
  style.AlignEscapedNewlines = format::FormatStyle::ENAS_DontAlign;

  // Reformat the whole buffer.
  format::FormattingAttemptStatus status;
  std::vector<tooling::Range> ranges{{0, unsigned(newCode.size())}};
  tooling::Replacements reps = format::reformat(style, newCode, ranges, "-", &status);
  auto res = tooling::applyAllReplacements(newCode, reps);
  if (!res)
    errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
  newCode = *res;
}
}

int main(int argc, char *argv[]) {
std::vector<const char *> args{argv[0], "-fsyntax-only"};
bool inplace = false;
const char *outfile = "/dev/stdout";
const char usage[] = R"(Usage: %s [-i] [-f fun]... a.c

Options:
-i edit a.c in place\n)";
for (int i = 1; i < argc; i++) {
StringRef opt(argv[i]);
if (opt[0] != '-')
args.push_back(argv[i]);
else if (opt == "-h") {
fputs(usage, stdout);
return 0;
} else if (opt == "-i")
inplace = true;
else if (opt == "-f" && i + 1 < argc)
ignores.push_back(argv[++i]);
else if (opt == "-o" && i + 1 < argc)
outfile = argv[++i];
else {
fputs(usage, stderr);
return 1;
}
}
ignores.push_back("main");

auto ci = buildCompilerInvocation(args);
if (!ci)
errx(1, "failed to build CompilerInvocation");

auto inst = std::make_unique<CompilerInstance>(std::make_shared<PCHContainerOperations>());
IgnoringDiagConsumer dc;
inst->setInvocation(std::move(ci));
inst->createDiagnostics(&dc, false);
inst->getDiagnostics().setIgnoreAllWarnings(true);
inst->setTarget(TargetInfo::CreateTargetInfo(inst->getDiagnostics(), inst->getInvocation().TargetOpts));
if (!inst->hasTarget())
errx(1, "hasTarget returns false");
inst->createFileManager(llvm::vfs::getRealFileSystem());
inst->setSourceManager(new SourceManager(inst->getDiagnostics(), inst->getFileManager(), true));

MiniAction action;
if (!action.BeginSourceFile(*inst, inst->getFrontendOpts().Inputs[0]))
errx(2, "failed to parse");
if (Error e = action.Execute())
errx(2, "failed to execute");
action.EndSourceFile();
reformat();

std::error_code ec;
raw_fd_ostream(inplace ? inst->getFrontendOpts().Inputs[0].getFile() : outfile, ec, sys::fs::OF_None) << newCode;
}

CMakeLists.txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Build script for the cminify executable (single source file: main.cc),
# compiled as C++17 and linked against Clang/LLVM.
cmake_minimum_required(VERSION 3.14)
project(cminify LANGUAGES C CXX)

# The target is created empty; main.cc is attached with target_sources() below.
add_executable(cminify "")
# NOTE(review): nothing in this file reads DEFAULT_CMAKE_BUILD_TYPE, so this
# line presumably has no effect on its own -- confirm whether a helper module
# was meant to consume it. The build type is passed with -DCMAKE_BUILD_TYPE.
set(DEFAULT_CMAKE_BUILD_TYPE Release)
set_property(TARGET cminify PROPERTY CXX_STANDARD 17)
set_property(TARGET cminify PROPERTY CXX_STANDARD_REQUIRED ON)
set_property(TARGET cminify PROPERTY CXX_EXTENSIONS OFF)

# NOTE(review): the CLANG_* and LLVM_* variables tested below are presumably
# defined by the package configuration loaded here -- confirm.
find_package(Clang REQUIRED)

# Link the single clang-cpp shared library when CLANG_LINK_CLANG_DYLIB is set;
# otherwise link the individual Clang component libraries.
if(CLANG_LINK_CLANG_DYLIB)
target_link_libraries(cminify PRIVATE clang-cpp)
else()
target_link_libraries(cminify PRIVATE
clangIndex
clangFormat
clangTooling
clangToolingInclusions
clangToolingCore
clangFrontend
clangParse
clangSerialization
clangSema
clangAST
clangLex
clangDriver
clangBasic
)
endif()

# Same choice for LLVM: one shared library or the two components used here.
if(LLVM_LINK_LLVM_DYLIB)
target_link_libraries(cminify PRIVATE LLVM)
else()
target_link_libraries(cminify PRIVATE LLVMOption LLVMSupport)
endif()

# Match the RTTI setting of the LLVM/Clang libraries.
if(NOT LLVM_ENABLE_RTTI)
# releases.llvm.org libraries are compiled with -fno-rtti
# The mismatch between lib{clang,LLVM}* and cminify can make libstdc++ std::make_shared return nullptr
# _Sp_counted_ptr_inplace::_M_get_deleter
if(MSVC)
target_compile_options(cminify PRIVATE /GR-)
else()
target_compile_options(cminify PRIVATE -fno-rtti)
endif()
endif()

target_sources(cminify PRIVATE main.cc)

# Add the LLVM and Clang header directories as SYSTEM include paths.
foreach(include_dir ${LLVM_INCLUDE_DIRS} ${CLANG_INCLUDE_DIRS})
get_filename_component(include_dir_realpath ${include_dir} REALPATH)
# Don't add as SYSTEM if they are in CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES.
# It would reorder the system search paths and cause issues with libstdc++'s
# use of #include_next. See https://github.com/MaskRay/ccls/pull/417
if(NOT "${include_dir_realpath}" IN_LIST CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES)
target_include_directories(cminify SYSTEM PRIVATE ${include_dir})
endif()
endforeach()

# Install the executable into <prefix>/bin.
install(TARGETS cminify RUNTIME DESTINATION bin)

Define LLVM as the llvm-project repository and LLVMOUT as the build directory (make sure you have at least built these targets: ninja clang clangFormat clangIndex clangTooling).

1
2
cmake -GNinja -S. -Bout/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$LLVMOUT;$LLVMOUT/tools/clang;$LLVM/llvm;$LLVM/clang"
ninja -C out/release

If LLVM and Clang's CMake, library, and header files are installed in well-known locations, then -DCMAKE_PREFIX_PATH can be omitted.

It's certainly not straightforward to find all these APIs. I mainly use ccls as a reference, which was inspired by clangIndex. For writing this tool, I read some of the code of clang-rename, clang-format, and C-Reduce's clang_delta. C-Reduce provides clang_delta/RenameFun.cpp and two other passes (RenameVar, RenameParam) which do similar things. Its code is a bit dated now, as it was written against a Clang from circa 2012.

Let's see an example. Unfortunately, I could not find clangFormat options that remove the whitespace after = and ,. That can perhaps be done by a post-processing string substitution tool without introducing too much risk.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
% cat test/a.c
#include <stdint.h>
#include <string.h>

#pragma GCC diagnostic ignored "-Wpragmas"
static float foo(int, int);
static float foo(int aaa, int bbb) { aaa = 3; bbb = 5; return 1.0f; }

struct NimStrPayload;
typedef struct NimStrPayload NimStrPayload;
struct NimStringV2;
typedef struct NimStringV2 NimStringV2;

struct NimStrPayload { int64_t cap; char data[]; };
struct NimStringV2 { int64_t cap; NimStrPayload *p; };

#define XX NimStringV2

float goo() {
int u, v, w, x, y, z;
int s1, t1, u1, v1, w1, x1, y1, z1;
NimStringV2 s;
XX t;
return 1.0f;
}

int main() {
char a[10];
memset(a, 0, 10);
float _ = foo(3, 5) + goo();
}
% out/release/cminify test/a.c
#include <stdint.h>
#include <string.h>

#pragma GCC diagnostic ignored "-Wpragmas"
static float a(int, int);
static float a(int k, int l) {
k= 3;
l= 5;
return 1.0f;
}

struct C;
typedef struct C A;
struct D;
typedef struct D B;

struct C {
int64_t cap;
char data[];
};
struct D {
int64_t cap;
A *p;
};

#define XX B

float b() {
int u, v, w, x, y, z;
int m, n, o, p, q, r, y1, z1;
B s;
XX t;
return 1.0f;
}

int main() {
char a[10];
memset(a, 0, 10);
float _= a(3, 5) + b();
}